require(arules)
require(arulesViz)
require(data.table)
require(dplyr)
require(ggplot2)
require(data.table)
#require(xlsx)
require(readxl)
require(ggmap)
#require(proj4)
require("performanceEstimation")
require("e1071")
require("DMwR")
data_dir <- "../data"
df <- read.csv(paste(paste( data_dir, "Accidents_2015.csv", sep= "/" )), stringsAsFactors = FALSE)
# Show some info
(colnames(df))
## [1] "Accident_Index"
## [2] "Location_Easting_OSGR"
## [3] "Location_Northing_OSGR"
## [4] "Longitude"
## [5] "Latitude"
## [6] "Police_Force"
## [7] "Accident_Severity"
## [8] "Number_of_Vehicles"
## [9] "Number_of_Casualties"
## [10] "Date"
## [11] "Day_of_Week"
## [12] "Time"
## [13] "Local_Authority_.District."
## [14] "Local_Authority_.Highway."
## [15] "X1st_Road_Class"
## [16] "X1st_Road_Number"
## [17] "Road_Type"
## [18] "Speed_limit"
## [19] "Junction_Detail"
## [20] "Junction_Control"
## [21] "X2nd_Road_Class"
## [22] "X2nd_Road_Number"
## [23] "Pedestrian_Crossing.Human_Control"
## [24] "Pedestrian_Crossing.Physical_Facilities"
## [25] "Light_Conditions"
## [26] "Weather_Conditions"
## [27] "Road_Surface_Conditions"
## [28] "Special_Conditions_at_Site"
## [29] "Carriageway_Hazards"
## [30] "Urban_or_Rural_Area"
## [31] "Did_Police_Officer_Attend_Scene_of_Accident"
## [32] "LSOA_of_Accident_Location"
(str(df))
## 'data.frame': 140056 obs. of 32 variables:
## $ Accident_Index : chr "201501BS70001" "201501BS70002" "201501BS70004" "201501BS70005" ...
## $ Location_Easting_OSGR : int 525130 526530 524610 524420 524630 525480 526890 527590 524170 525010 ...
## $ Location_Northing_OSGR : int 180050 178560 181080 181080 179040 179530 178940 178660 180930 181200 ...
## $ Longitude : num -0.198 -0.179 -0.206 -0.208 -0.206 ...
## $ Latitude : num 51.5 51.5 51.5 51.5 51.5 ...
## $ Police_Force : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Accident_Severity : int 3 3 3 3 2 3 3 3 3 3 ...
## $ Number_of_Vehicles : int 1 1 1 1 2 2 2 2 2 2 ...
## $ Number_of_Casualties : int 1 1 1 1 1 1 1 1 1 2 ...
## $ Date : chr "12/01/2015" "12/01/2015" "12/01/2015" "13/01/2015" ...
## $ Day_of_Week : int 2 2 2 3 6 5 5 1 6 3 ...
## $ Time : chr "18:45" "07:50" "18:08" "07:40" ...
## $ Local_Authority_.District. : int 12 12 12 12 12 12 12 12 12 12 ...
## $ Local_Authority_.Highway. : chr "E09000020" "E09000020" "E09000020" "E09000020" ...
## $ X1st_Road_Class : int 5 6 4 4 3 3 3 6 5 6 ...
## $ X1st_Road_Number : int 0 0 415 450 315 315 3218 0 0 0 ...
## $ Road_Type : int 6 6 6 6 6 6 6 6 6 6 ...
## $ Speed_limit : int 30 30 30 30 30 30 30 30 30 30 ...
## $ Junction_Detail : int 3 3 2 6 6 3 6 0 3 3 ...
## $ Junction_Control : int 4 4 4 4 2 4 2 -1 4 4 ...
## $ X2nd_Road_Class : int 6 3 6 6 3 5 3 -1 6 6 ...
## $ X2nd_Road_Number : int 0 3218 0 0 3220 0 3218 0 0 0 ...
## $ Pedestrian_Crossing.Human_Control : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Pedestrian_Crossing.Physical_Facilities : int 0 0 1 0 5 4 5 0 1 0 ...
## $ Light_Conditions : int 4 1 4 1 1 1 1 1 1 1 ...
## $ Weather_Conditions : int 1 1 2 1 2 1 8 1 1 8 ...
## $ Road_Surface_Conditions : int 1 1 2 2 2 2 2 1 1 1 ...
## $ Special_Conditions_at_Site : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Carriageway_Hazards : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Urban_or_Rural_Area : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Did_Police_Officer_Attend_Scene_of_Accident: int 1 1 1 2 2 1 1 1 1 1 ...
## $ LSOA_of_Accident_Location : chr "E01002825" "E01002820" "E01002833" "E01002874" ...
## NULL
(ncol(df))
## [1] 32
(nrow(df))
## [1] 140056
## Change messy column names
setnames( df , "Local_Authority_.District." , "Local_Authority_District" )
setnames( df , "Local_Authority_.Highway." , "Local_Authority_Highway" )
setnames( df , "X1st_Road_Class" , "First_Road_Class" )
setnames( df , "X1st_Road_Number" , "First_Road_Number" )
setnames( df , "X2nd_Road_Class" , "Second_Road_Class" )
setnames( df , "X2nd_Road_Number" , "Second_Road_Number" )
setnames( df , "Pedestrian_Crossing.Human_Control" , "Pedestrian_Crossing-Human_Control" )
setnames( df , "Pedestrian_Crossing.Physical_Facilities" , "Pedestrian_Crossing-Physical_Facilities" )
(colnames(df))
## [1] "Accident_Index"
## [2] "Location_Easting_OSGR"
## [3] "Location_Northing_OSGR"
## [4] "Longitude"
## [5] "Latitude"
## [6] "Police_Force"
## [7] "Accident_Severity"
## [8] "Number_of_Vehicles"
## [9] "Number_of_Casualties"
## [10] "Date"
## [11] "Day_of_Week"
## [12] "Time"
## [13] "Local_Authority_District"
## [14] "Local_Authority_Highway"
## [15] "First_Road_Class"
## [16] "First_Road_Number"
## [17] "Road_Type"
## [18] "Speed_limit"
## [19] "Junction_Detail"
## [20] "Junction_Control"
## [21] "Second_Road_Class"
## [22] "Second_Road_Number"
## [23] "Pedestrian_Crossing-Human_Control"
## [24] "Pedestrian_Crossing-Physical_Facilities"
## [25] "Light_Conditions"
## [26] "Weather_Conditions"
## [27] "Road_Surface_Conditions"
## [28] "Special_Conditions_at_Site"
## [29] "Carriageway_Hazards"
## [30] "Urban_or_Rural_Area"
## [31] "Did_Police_Officer_Attend_Scene_of_Accident"
## [32] "LSOA_of_Accident_Location"
´
(apply(is.na(df), 2, sum))
(apply( df, 2, function(x) length( unique(x) )))
# Quick view on data
(head(df))
# Some dummy copy
df_copy <- df
# Lets map things according lookup table from spreadsheet
map <- data.table( number_sheet = 3:21,
name_variable = c( 'Police_Force',
'Accident_Severity',
'Day_of_Week',
'Local_Authority_District',
'Local_Authority_Highway',
'First_Road_Class',
'Road_Type',
'Junction_Detail',
'Junction_Control',
'Second_Road_Class',
'Pedestrian_Crossing-Human_Control',
'Pedestrian_Crossing-Physical_Facilities',
'Light_Conditions',
'Weather_Conditions',
'Road_Surface_Conditions',
'Special_Conditions_at_Site',
'Carriageway_Hazards',
'Urban_or_Rural_Area',
'Did_Police_Officer_Attend_Scene_of_Accident')
)
## Iterate
for( Index in map$number_sheet ){
look<- data.table(read_excel(paste(data_dir, "Road-Accident-Safety-Data-Guide.xls" , sep= "/" ),
sheet = Index))
names(look)[1] <- tolower( names(look)[1])
name_variable <- map[ number_sheet == Index , name_variable ]
df_copy [[name_variable]] <- look[ match( df[[name_variable]], look[['code']]) ]$label
}
# No need for df and look, lets remove from memory to save space
df <- df_copy
rm(look)
rm(df_copy)
# Quick view on data
(head(df))
## Accident_Index Location_Easting_OSGR Location_Northing_OSGR Longitude
## 1 201501BS70001 525130 180050 -0.198465
## 2 201501BS70002 526530 178560 -0.178838
## 3 201501BS70004 524610 181080 -0.205590
## 4 201501BS70005 524420 181080 -0.208327
## 5 201501BS70008 524630 179040 -0.206022
## 6 201501BS70009 525480 179530 -0.193610
## Latitude Police_Force Accident_Severity Number_of_Vehicles
## 1 51.50554 Metropolitan Police Slight 1
## 2 51.49184 Metropolitan Police Slight 1
## 3 51.51491 Metropolitan Police Slight 1
## 4 51.51495 Metropolitan Police Slight 1
## 5 51.49657 Metropolitan Police Serious 2
## 6 51.50079 Metropolitan Police Slight 2
## Number_of_Casualties Date Day_of_Week Time Local_Authority_District
## 1 1 12/01/2015 Monday 18:45 Kensington and Chelsea
## 2 1 12/01/2015 Monday 07:50 Kensington and Chelsea
## 3 1 12/01/2015 Monday 18:08 Kensington and Chelsea
## 4 1 13/01/2015 Tuesday 07:40 Kensington and Chelsea
## 5 1 09/01/2015 Friday 07:30 Kensington and Chelsea
## 6 1 15/01/2015 Thursday 09:56 Kensington and Chelsea
## First_Road_Class First_Road_Number Road_Type Speed_limit
## 1 C 0 Single carriageway 30
## 2 Unclassified 0 Single carriageway 30
## 3 B 415 Single carriageway 30
## 4 B 450 Single carriageway 30
## 5 A 315 Single carriageway 30
## 6 A 315 Single carriageway 30
## Junction_Detail Junction_Control Second_Road_Class
## 1 T or staggered junction Give way or uncontrolled Unclassified
## 2 T or staggered junction Give way or uncontrolled A
## 3 Mini-roundabout Give way or uncontrolled Unclassified
## 4 Crossroads Give way or uncontrolled Unclassified
## 5 Crossroads Auto traffic signal A
## 6 T or staggered junction Give way or uncontrolled C
## Second_Road_Number Pedestrian_Crossing-Human_Control
## 1 0 None within 50 metres
## 2 3218 None within 50 metres
## 3 0 None within 50 metres
## 4 0 None within 50 metres
## 5 3220 None within 50 metres
## 6 0 None within 50 metres
## Pedestrian_Crossing-Physical_Facilities
## 1 No physical crossing facilities within 50 metres
## 2 No physical crossing facilities within 50 metres
## 3 Zebra
## 4 No physical crossing facilities within 50 metres
## 5 Pedestrian phase at traffic signal junction
## 6 Pelican, puffin, toucan or similar non-junction pedestrian light crossing
## Light_Conditions Weather_Conditions Road_Surface_Conditions
## 1 Darkness - lights lit Fine no high winds Dry
## 2 Daylight Fine no high winds Dry
## 3 Darkness - lights lit Raining no high winds Wet or damp
## 4 Daylight Fine no high winds Wet or damp
## 5 Daylight Raining no high winds Wet or damp
## 6 Daylight Fine no high winds Wet or damp
## Special_Conditions_at_Site Carriageway_Hazards Urban_or_Rural_Area
## 1 None None Urban
## 2 None None Urban
## 3 None None Urban
## 4 None None Urban
## 5 None None Urban
## 6 None None Urban
## Did_Police_Officer_Attend_Scene_of_Accident LSOA_of_Accident_Location
## 1 Yes E01002825
## 2 Yes E01002820
## 3 Yes E01002833
## 4 No E01002874
## 5 No E01002814
## 6 Yes E01002816
for ( Index in map$number_sheet ){
name_variable <- map[ number_sheet == Index , name_variable ]
cat ( paste0( "\n\n",name_variable, ":" ))
print ( table( df[[name_variable]] ))
}
##
##
## Police_Force:
## Avon and Somerset Bedfordshire Cambridgeshire
## 3000 1478 1968
## Central Cheshire City of London
## 508 2322 346
## Cleveland Cumbria Derbyshire
## 941 1293 2147
## Devon and Cornwall Dorset Dumfries and Galloway
## 3856 1777 276
## Durham Dyfed-Powys Essex
## 1073 1325 3746
## Fife Gloucestershire Grampian
## 428 815 656
## Greater Manchester Gwent Hampshire
## 3073 689 4365
## Hertfordshire Humberside Kent
## 2510 2394 4878
## Lancashire Leicestershire Lincolnshire
## 3711 2249 2134
## Lothian and Borders Merseyside Metropolitan Police
## 2083 2706 24886
## Norfolk North Wales North Yorkshire
## 1749 1333 2062
## Northamptonshire Northern Northumbria
## 1323 448 3229
## Nottinghamshire South Wales South Yorkshire
## 2750 2196 3066
## Staffordshire Strathclyde Suffolk
## 2599 3590 1486
## Surrey Sussex Tayside
## 3858 4302 475
## Thames Valley Warwickshire West Mercia
## 5190 1504 2297
## West Midlands West Yorkshire Wiltshire
## 6096 5286 1584
##
##
## Accident_Severity:
## Fatal Serious Slight
## 1616 20038 118402
##
##
## Day_of_Week:
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 22374 20032 18114 15258 21479 21431 21368
##
##
## Local_Authority_District:
## Aberdeen City Aberdeenshire
## 227 347
## Adur Allerdale
## 156 247
## Amber Valley Angus
## 213 144
## Argyll and Bute Arun
## 226 313
## Ashfield Ashford
## 269 341
## Aylesbury Vale Babergh
## 361 215
## Barking and Dagenham Barnet
## 482 1062
## Barnsley Barrow-in-Furness
## 499 144
## Basildon Basingstoke and Deane
## 389 340
## Bassetlaw Bath and North East Somerset
## 284 303
## Bedford Bexley
## 399 437
## Birmingham Blaby
## 2970 218
## Blackburn with Darwen Blackpool
## 398 375
## Blaenau Gwent Bolsover
## 64 177
## Bolton Boston
## 373 205
## Bournemouth Bracknell Forest
## 489 212
## Bradford Braintree
## 1235 302
## Breckland Brent
## 278 907
## Brentwood Bridgend
## 187 232
## Brighton and Hove Bristol, City of
## 780 942
## Broadland Bromley
## 230 792
## Bromsgrove Broxbourne
## 181 200
## Broxtowe Burnley
## 223 249
## Bury Caerphilly
## 179 189
## Calderdale Cambridge
## 411 346
## Camden Cannock Chase
## 936 207
## Canterbury Cardiff
## 371 622
## Carlisle Carmarthenshire
## 256 436
## Castle Point Central Bedfordshire
## 171 609
## Ceredigion Charnwood
## 177 303
## Chelmsford Cheltenham
## 360 131
## Cherwell Cheshire East
## 408 837
## Cheshire West and Chester Chesterfield
## 756 194
## Chichester Chiltern
## 347 120
## Chorley Christchurch
## 241 113
## City of London Clackmannanshire
## 344 62
## Colchester Conwy
## 385 250
## Copeland Corby
## 166 90
## Cornwall Cotswold
## 1229 142
## County Durham Coventry
## 866 688
## Craven Crawley
## 161 306
## Croydon Dacorum
## 896 233
## Darlington Dartford
## 207 415
## Daventry Denbighshire
## 188 227
## Derby Derbyshire Dales
## 583 185
## Doncaster Dover
## 855 248
## Dudley Dumfries and Galloway
## 432 276
## Dundee City Ealing
## 131 1014
## East Ayrshire East Cambridgeshire
## 204 145
## East Devon East Dorset
## 301 189
## East Dunbartonshire East Hampshire
## 95 234
## East Hertfordshire East Lindsey
## 235 447
## East Lothian East Northamptonshire
## 158 119
## East Renfrewshire East Riding of Yorkshire
## 94 762
## East Staffordshire Eastbourne
## 344 245
## Eastleigh Eden
## 274 203
## Edinburgh, City of Elmbridge
## 1111 412
## Enfield Epping Forest
## 789 384
## Epsom and Ewell Erewash
## 182 216
## Exeter Falkirk
## 262 249
## Fareham Fenland
## 250 189
## Fife Flintshire
## 428 249
## Forest Heath Forest of Dean
## 128 100
## Fylde Gateshead
## 215 517
## Gedling Glasgow City
## 191 1198
## Gloucester Gosport
## 173 138
## Gravesham Great Yarmouth
## 269 162
## Greenwich Guildford
## 646 563
## Gwynedd Hackney
## 246 857
## Halton Hambleton
## 224 253
## Hammersmith and Fulham Harborough
## 614 182
## Haringey Harlow
## 881 127
## Harrogate Harrow
## 457 424
## Hart Hartlepool
## 211 136
## Hastings Havant
## 237 223
## Havering Herefordshire, County of
## 632 400
## Hertsmere High Peak
## 364 182
## Highland Hillingdon
## 379 724
## Hinckley and Bosworth Horsham
## 200 309
## Hounslow Huntingdonshire
## 842 378
## Hyndburn Inverclyde
## 208 109
## Ipswich Isle of Anglesey
## 260 100
## Isle of Wight Islington
## 315 844
## Kensington and Chelsea Kettering
## 632 148
## King's Lynn and West Norfolk Kingston upon Hull, City of
## 272 748
## Kingston upon Thames Kirklees
## 332 944
## Knowsley Lambeth
## 271 1226
## Lancaster Leeds
## 388 1979
## Leicester Lewes
## 845 253
## Lewisham Lichfield
## 884 227
## Lincoln Liverpool
## 303 1137
## London Airport (Heathrow) Luton
## 39 470
## Maidstone Maldon
## 443 101
## Malvern Hills Manchester
## 171 723
## Mansfield Medway
## 252 656
## Melton Mendip
## 113 188
## Merthyr Tydfil Merton
## 99 497
## Mid Devon Mid Suffolk
## 138 203
## Mid Sussex Middlesbrough
## 363 304
## Midlothian Milton Keynes
## 190 641
## Mole Valley Monmouthshire
## 291 126
## Moray Neath Port Talbot
## 82 216
## New Forest Newark and Sherwood
## 425 321
## Newcastle upon Tyne Newcastle-under-Lyme
## 765 275
## Newham Newport
## 881 232
## North Ayrshire North Devon
## 190 217
## North Dorset North East Derbyshire
## 130 204
## North East Lincolnshire North Hertfordshire
## 435 290
## North Kesteven North Lanarkshire
## 287 447
## North Lincolnshire North Norfolk
## 449 163
## North Somerset North Tyneside
## 339 447
## North Warwickshire North West Leicestershire
## 315 219
## Northampton Northumberland
## 432 716
## Norwich Nottingham
## 374 956
## Nuneaton and Bedworth Oadby and Wigston
## 226 76
## Oldham Orkney Islands
## 289 12
## Oxford Pembrokeshire
## 409 284
## Pendle Perth and Kinross
## 172 200
## Peterborough Plymouth
## 558 619
## Poole Portsmouth
## 314 591
## Powys Preston
## 428 387
## Purbeck Reading
## 142 348
## Redbridge Redcar and Cleveland
## 718 190
## Redditch Reigate and Banstead
## 112 478
## Renfrewshire Rhondda, Cynon, Taff
## 258 395
## Ribble Valley Richmond upon Thames
## 118 401
## Richmondshire Rochdale
## 158 249
## Rochford Rossendale
## 138 145
## Rother Rotherham
## 292 574
## Rugby Runnymede
## 330 371
## Rushcliffe Rushmoor
## 254 184
## Rutland Ryedale
## 93 149
## Salford Sandwell
## 284 651
## Scarborough Scottish Borders
## 245 220
## Sedgemoor Sefton
## 243 476
## Selby Sevenoaks
## 190 380
## Sheffield Shepway
## 1138 283
## Shetland Islands Shropshire
## 25 571
## Slough Solihull
## 465 297
## South Ayrshire South Bucks
## 192 225
## South Cambridgeshire South Derbyshire
## 352 193
## South Gloucestershire South Hams
## 394 211
## South Holland South Kesteven
## 243 359
## South Lakeland South Lanarkshire
## 277 459
## South Norfolk South Northamptonshire
## 270 198
## South Oxfordshire South Ribble
## 331 312
## South Somerset South Staffordshire
## 323 316
## South Tyneside Southampton
## 253 594
## Southend-on-Sea Southwark
## 393 907
## Spelthorne St. Albans
## 308 333
## St. Edmundsbury St. Helens
## 194 317
## Stafford Staffordshire Moorlands
## 322 212
## Stevenage Stirling
## 167 197
## Stockport Stockton-on-Tees
## 230 311
## Stoke-on-Trent Stratford-upon-Avon
## 579 330
## Stroud Suffolk Coastal
## 132 271
## Sunderland Surrey Heath
## 531 249
## Sutton Swale
## 322 361
## Swansea Swindon
## 451 448
## Tameside Tamworth
## 238 117
## Tandridge Taunton Deane
## 410 192
## Teignbridge Telford and Wrekin
## 313 227
## Tendring Test Valley
## 290 263
## Tewkesbury Thanet
## 137 430
## The Vale of Glamorgan Three Rivers
## 181 190
## Thurrock Tonbridge and Malling
## 344 366
## Torbay Torfaen
## 281 78
## Torridge Tower Hamlets
## 140 1066
## Trafford Tunbridge Wells
## 260 315
## Uttlesford Vale of White Horse
## 175 281
## Wakefield Walsall
## 717 523
## Waltham Forest Wandsworth
## 642 984
## Warrington Warwick
## 505 303
## Watford Waveney
## 203 215
## Waverley Wealden
## 326 439
## Wellingborough Welwyn Hatfield
## 148 295
## West Berkshire West Devon
## 294 145
## West Dorset West Dunbartonshire
## 277 118
## West Lancashire West Lindsey
## 264 290
## West Lothian West Oxfordshire
## 404 194
## West Somerset Western Isles
## 76 32
## Westminster Weymouth and Portland
## 1578 123
## Wigan Wiltshire
## 248 1136
## Winchester Windsor and Maidenhead
## 323 338
## Wirral Woking
## 505 268
## Wokingham Wolverhampton
## 264 535
## Worcester Worthing
## 162 262
## Wrexham Wychavon
## 261 266
## Wycombe Wyre
## 299 239
## Wyre Forest York
## 207 449
##
##
## Local_Authority_Highway:< table of extent 0 >
##
##
## First_Road_Class:
## A A(M) B C Motorway Unclassified
## 64280 402 17217 11069 5148 41940
##
##
## Road_Type:
## Dual carriageway One way street Roundabout Single carriageway
## 20266 2873 9589 105066
## Slip road Unknown
## 1456 806
##
##
## Junction_Detail:
## Crossroads Data missing or out of range
## 13762 1
## Mini-roundabout More than 4 arms (not roundabout)
## 1900 999
## Not at junction or within 20 metres Other junction
## 55927 3103
## Private drive or entrance Roundabout
## 5252 12423
## Slip road T or staggered junction
## 2032 44657
##
##
## Junction_Control:
## Authorised person Auto traffic signal
## 172 14456
## Data missing or out of range Give way or uncontrolled
## 56070 68655
## Stop sign
## 703
##
##
## Second_Road_Class:
## A A(M) B C Motorway Unclassified
## 13801 96 5626 5991 851 57088
##
##
## Pedestrian_Crossing-Human_Control:
## Control by other authorised person Control by school crossing patrol
## 433 282
## Data missing or out of range None within 50 metres
## 140 139201
##
##
## Pedestrian_Crossing-Physical_Facilities:
## Central refuge
## 3157
## Data missing or out of range
## 127
## Footbridge or subway
## 392
## No physical crossing facilities within 50 metres
## 113486
## Pedestrian phase at traffic signal junction
## 10865
## Pelican, puffin, toucan or similar non-junction pedestrian light crossing
## 7751
## Zebra
## 4278
##
##
## Light_Conditions:
## Darkness - lighting unknown Darkness - lights lit
## 1920 27551
## Darkness - lights unlit Darkness - no lighting
## 719 7258
## Daylight
## 102608
##
##
## Weather_Conditions:
## Fine + high winds Fine no high winds Fog or mist
## 2353 113949 662
## Other Raining + high winds Raining no high winds
## 1986 2365 15559
## Snowing + high winds Snowing no high winds Unknown
## 175 540 2467
##
##
## Road_Surface_Conditions:
## Data missing or out of range Dry
## 284 101355
## Flood over 3cm. deep Frost or ice
## 184 1461
## Snow Wet or damp
## 535 36237
##
##
## Special_Conditions_at_Site:
## Auto signal part defective
## 51
## Auto traffic signal - out
## 212
## Data missing or out of range
## 106
## Mud
## 424
## None
## 136823
## Oil or diesel
## 384
## Road sign or marking defective or obscured
## 160
## Road surface defective
## 299
## Roadworks
## 1597
##
##
## Carriageway_Hazards:
## Any animal in carriageway (except ridden horse)
## 610
## Data missing or out of range
## 95
## None
## 137749
## Other object on road
## 929
## Pedestrian in carriageway - not injured
## 307
## Previous accident
## 204
## Vehicle load on road
## 162
##
##
## Urban_or_Rural_Area:
## Rural Urban
## 47976 92080
##
##
## Did_Police_Officer_Attend_Scene_of_Accident:
## No
## 27064
## No - accident was reported using a self completion form (self rep only)
## 377
## Yes
## 112607
th <- 9
# Subset to number of vehicles larger that threshold
df_large_crashes <- df["Number_of_Vehicles" > th, ]
barplot(prop.table(table(df_large_crashes$Road_Type)), las=2, cex.names=.5)
*Note: Its kind of obvious that highways and single Carriage account for the majority of cars involved
# Adding Spped_limit
(df_large_crashes %>% group_by(Road_Type, Speed_limit) %>%
tally() %>%
arrange(desc(n)))
## # A tibble: 37 x 3
## # Groups: Road_Type [6]
## Road_Type Speed_limit n
## <chr> <int> <int>
## 1 Single carriageway 30 74667
## 2 Single carriageway 60 17081
## 3 Dual carriageway 70 8527
## 4 Single carriageway 40 6691
## 5 Roundabout 30 6117
## 6 Dual carriageway 30 5524
## 7 Single carriageway 20 3575
## 8 Dual carriageway 40 3468
## 9 Single carriageway 50 3051
## 10 One way street 30 2489
## # … with 27 more rows
*Note: High Speed limits account for majority of cars involved (obvious). Considering the close proximity, the Road Type (Dual Carriage) and the Speed limit(70), we may find some areas were major crash in series occurs. Perhaps some more attention is needed to those spots by local Authorities.
load(file= "map.Rdata")
UK <- get_map(location = 'England', zoom=8)
map <- ggmap(UK)
´
# Conver to datatable for easy encoding
df_copy <-data.table(df)
# Dummy copy
df_copy[, Accident_Index := NULL]
## Convert factor date to date
df_copy[, Date := as.Date( as.character( Date ), format = "%d/%m/%Y" ) ]
df_copy[, Day_of_year := as.Date( format( Date, "%m-%d" ), format = "%m-%d" )]
# Now as factors (data as Winter, Spring, Summer, Fall) (To be precise in the days)
df_copy[, Season := ifelse( Day_of_year >= as.Date( "03-20", format = "%m-%d" ) & Day_of_year < as.Date( "06-21", format = "%m-%d" ) ,"Spring",
ifelse( Day_of_year >= as.Date( "06-21", format = "%m-%d" ) & Day_of_year < as.Date( "09-22", format = "%m-%d" ) ,"Summer",
ifelse( Day_of_year >= as.Date( "09-22", format = "%m-%d" ) & Day_of_year <= as.Date( "12-21", format = "%m-%d" ) ,"Fall", "Winter")))]
# Get the month of the year
df_copy[, c( "Month", "Day_of_year" ) := list( factor( format( Date, "%b" )),NULL ) ]
# Conver hour:minutes only to hours
df_copy[, Hour := as.numeric( substr( Time,1,2 )) ]
# Convert time to Morning, Afternoon, Night and Evening (Reduce Granularity)
breaks <- c( 0, 6, 12, 18, 23 )
labels <- c( "Night", "Morning", "Afternoon", "Evening" )
df_copy[, Day_Period := cut( as.numeric( Hour ), breaks = breaks,
labels = labels , include.lowest=TRUE )]
# Check conversion CHECK
(apply(is.na(df_copy), 2, sum)) # Time 18 not being right converted
## Location_Easting_OSGR
## 27
## Location_Northing_OSGR
## 27
## Longitude
## 27
## Latitude
## 27
## Police_Force
## 0
## Accident_Severity
## 0
## Number_of_Vehicles
## 0
## Number_of_Casualties
## 0
## Date
## 0
## Day_of_Week
## 0
## Time
## 0
## Local_Authority_District
## 0
## First_Road_Class
## 0
## First_Road_Number
## 0
## Road_Type
## 0
## Speed_limit
## 0
## Junction_Detail
## 0
## Junction_Control
## 0
## Second_Road_Class
## 56603
## Second_Road_Number
## 0
## Pedestrian_Crossing-Human_Control
## 0
## Pedestrian_Crossing-Physical_Facilities
## 0
## Light_Conditions
## 0
## Weather_Conditions
## 0
## Road_Surface_Conditions
## 0
## Special_Conditions_at_Site
## 0
## Carriageway_Hazards
## 0
## Urban_or_Rural_Area
## 0
## Did_Police_Officer_Attend_Scene_of_Accident
## 8
## LSOA_of_Accident_Location
## 0
## Season
## 0
## Month
## 0
## Hour
## 18
## Day_Period
## 18
# Factorize number casualties ("Small", "Medium", "Large", "Overweelming")
breaks <- c( "0", "2", "4", "12", "inf" )
labels <- c( "Small", "Medium", "Large", "Overweelming" )
df_copy[, "Casualties_Class" := cut( Number_of_Casualties , breaks = breaks ,
labels = labels , include.lowest=TRUE )]
# Summarize number of vehicles to (small, medium, high)
breaks <- c( "0", "2", "8", "inf" )
labels <- c( "Small", "Medium", "Large" )
df_copy [, "Number_Vehicles_Class" := cut( Number_of_Vehicles, breaks = breaks,
labels=labels , include.lowest=TRUE )]
df_copy$Speed_limit <- as.factor( df_copy$Speed_limit )
# Local copy to maintain the analisys further in the same domain.
df_subset <- df_copy
(table(df_subset$Accident_Severity))
##
## Fatal Serious Slight
## 1616 20038 118402
barplot(prop.table(table(df_subset$Accident_Severity)),las=2, cex.names=.5)
*Note: Majority of accidents are minor
barplot(prop.table(table(df_subset$Number_of_Casualties)), cex.names=.5)
ggplot(df_subset, aes(Casualties_Class,fill =Accident_Severity)) +
geom_bar()+
theme_bw() +
geom_bar( position = "fill")+
theme(axis.text.x=element_text(angle=90, hjust=1))
*Note: We observer that when a number of veichles are involved, the larger the number of deaths (highways)
barplot(prop.table(table(df_subset$Day_of_Week)), las=2, cex.names=.5)
*Note: On Sundays we see lesst accidents, (people stay at home or drive more relaxed with family). Also we some increase on fridays, some hurry to reach home, our madness of the start of the weekend..
barplot(prop.table(table(df_subset$Speed_limit)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$Road_Type)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$Road_Surface_Conditions)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$Season)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$Month)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$Hour)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$Day_Period)), las=2, cex.names=.5) ## ok obvious, last info tells 17h
barplot(prop.table(table(df_subset$Urban_or_Rural_Area)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$Light_Conditions)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$First_Road_Class)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$First_Road_Class)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$`Pedestrian_Crossing-Physical_Facilities`)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$Police_Force)), las=2, cex.names=.5)
barplot(prop.table(table(df_subset$Police_Force)), las=2, cex.names=.5)
ggplot(df_subset, aes(Weather_Conditions, fill = Accident_Severity))+
geom_bar()+
theme_bw()+
geom_bar( position = "fill")+
theme_bw()+
theme(axis.text.x=element_text(angle=90, hjust=1))
*Note: Most accicnet occur on metroploita with no severe weather conditions…
Select Some relevant columns tha may be associated with accident values
*Note: Computing this values takes to long, we load them previous
load(file ="df_subset_Ent.Rdata")
load(file ="entropy.gains.Rdata")
load(file ="correlation.filter.selection.Rdata")
library(FSelector)
# Dummy copy
df_rules <- df_model
rm(df_model)
# Create an subset to see the correlation of the variables to give clues about their relevance.
df_rules_ent <- df_rules[, setdiff( names( df_rules),c( "Longitude",
"Latitude",
"First_Road_Number",
"Second_Road_Number",
"Number_of_Casualties" ,
"Time",
"Date",
"Number_of_Vehicles",
"Hour",
"LSOA_of_Accident_Location" ,
"Location_Easting_OSGR",
"Location_Northing_OSGR" )),
with=F]
# Check the entropy gains regarding the Accident Severity
entropy.gains <- information.gain ( Accident_Severity~. , df_rules_ent )
# Compute the corollation
correlation.filter.selection <- cfs ( Accident_Severity~. , df_rules_ent )
# Select the variables to use
variable.use <- unique( c( cutoff.k( entropy.gains, 11 ), correlation.filter.selection ))
# Show the choices
(variable.use)
## Create the subset with the entropy suggestions for laler use.
subset_entropy <- df_subset[, c( "Accident_Severity", variable.use ), with=F ]
df_rules <- df_subset
# Convert the remainder values to factors
df_rules$Season <- as.factor(df_rules$Season)
df_rules$Hour <- as.factor(df_rules$Hour)
# Here we make our initial selection.
subset <- subset(df_rules, select =c("Accident_Severity",
"Speed_limit",
"Light_Conditions",
"Urban_or_Rural_Area",
"Month",
"Pedestrian_Crossing-Physical_Facilities",
"Day_Period",
"Road_Type",
"Road_Surface_Conditions",
"Junction_Detail",
"Junction_Control",
"Casualties_Class",
"Weather_Conditions",
"Season"))
# Convert then to transacions
trans <- as(subset, "transactions")
summary(trans)
## transactions as itemMatrix in sparse format with
## 140056 rows (elements/itemsets/transactions) and
## 85 columns (items) and a density of 0.1647044
##
## most frequent items:
## Casualties_Class=Small
## 130533
## Accident_Severity=Slight
## 118402
## Weather_Conditions=Fine no high winds
## 113949
## Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres
## 113486
## Road_Type=Single carriageway
## 105066
## (Other)
## 1379330
##
## element (itemset/transaction) length distribution:
## sizes
## 13 14
## 18 140038
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13 14 14 14 14 14
##
## includes extended item information - examples:
## labels variables levels
## 1 Accident_Severity=Fatal Accident_Severity Fatal
## 2 Accident_Severity=Serious Accident_Severity Serious
## 3 Accident_Severity=Slight Accident_Severity Slight
##
## includes extended transaction information - examples:
## transactionID
## 1 1
## 2 2
## 3 3
# No longles need subset
rm(subset)
# Check some items labels
head(itemLabels(trans))
## [1] "Accident_Severity=Fatal" "Accident_Severity=Serious"
## [3] "Accident_Severity=Slight" "Speed_limit=0"
## [5] "Speed_limit=10" "Speed_limit=20"
# Check the item frequency (top 25)
itemFrequencyPlot(trans, topN=25, cex.names=.5)
*Note: Focus in the Accident_Severity=Fatal
trans_sub <- subset(trans, items %in% "Accident_Severity=Fatal")
itemFrequencyPlot(trans_sub, topN = 25, population = trans, cex.names=.5)
# Order By lift
itemFrequencyPlot(trans_sub, topN = 25, population = trans, lift=TRUE, cex.names=.5)
*Note: The graph manifest that the Accident_Severity=Fatal appear frequently as the no physical crossing facilities within 50 meters also dry conditions and rural areas.
trans_sub <- subset(trans, items %in% "Light_Conditions=Darkness - lights lit")
itemFrequencyPlot(trans_sub, topN = 25, population = trans, cex.names=.5)
itemFrequencyPlot(trans_sub, topN = 25, population = trans, lift=TRUE, cex.names=.5)
*Note: The graph manifest that the light conditions - Darkness lights lit Urbarn_or_Rural_Area=Urban and Speed_limit=30 appear frequently, what makes sense to a urban city area.
trans_sub <- subset(trans, items %in% "Road_Surface_Conditions=Wet or damp")
itemFrequencyPlot(trans_sub, topN = 25, population = trans, cex.names=.5)
itemFrequencyPlot(trans_sub, topN = 25, population = trans, lift=TRUE, cex.names=.5)
*Note: The graph manifest that the the Road_Surface_Conditions=Wet_or_damp apper frequently with Road_Type=Single Carriageway. Also ordering by lift we may find a strong co-occurence of Surface_Conditions=Wet or damp with the winter months (Nov, Dec, Jan) what is obvious.
trans_sub <- subset(trans, items %in% "Day_Period=Night")
itemFrequencyPlot(trans_sub, topN = 25, population = trans, cex.names=.5)
itemFrequencyPlot(trans_sub, topN = 25, population = trans, lift=TRUE, cex.names=.5)
*Note: Analysing the lift graph, we find that the Day_period=Night co-occcur toguether with Light_Conditions=Darkneess - lights unlit and Darkness -no Lighting and we also may find a small co-occurence of the fatal accidents severety. The lift value is not very high.
# Drop trans_sub so save memory
rm(trans_sub)
# Find an interesting support (have at least 500 transactions)
(500/nrow(trans))
## [1] 0.003570001
# User parameters
min_supp <- 0.0036
min_conf <- 0.5
min_lift <- 0.6
min_len <- 2
max_len <- 4
## Generate frequent itemsets
itemsets <- apriori(trans, parameter = list(target = "frequent",
supp=min_supp, minlen = min_len, maxlen =max_len))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.0036 2
## maxlen target ext
## 4 frequent itemsets TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 504
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[85 item(s), 140056 transaction(s)] done [0.10s].
## sorting and recoding items ... [75 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 done [0.34s].
## sorting transactions ... done [0.04s].
## writing ... [53285 set(s)] done [0.01s].
## creating S4 object ... done [0.05s].
inspect(head(sort(itemsets), n=10))
## items support transIdenticalToItemsets count
## [1] {Accident_Severity=Slight,
## Casualties_Class=Small} 0.7925116 0 110996
## [2] {Casualties_Class=Small,
## Weather_Conditions=Fine no high winds} 0.7590892 0 106315
## [3] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,
## Casualties_Class=Small} 0.7518707 0 105304
## [4] {Road_Type=Single carriageway,
## Casualties_Class=Small} 0.7022405 0 98353
## [5] {Road_Surface_Conditions=Dry,
## Weather_Conditions=Fine no high winds} 0.6947650 0 97306
## [6] {Accident_Severity=Slight,
## Weather_Conditions=Fine no high winds} 0.6863397 0 96126
## [7] {Light_Conditions=Daylight,
## Casualties_Class=Small} 0.6847547 0 95904
## [8] {Accident_Severity=Slight,
## Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres} 0.6813489 0 95427
## [9] {Road_Surface_Conditions=Dry,
## Casualties_Class=Small} 0.6765080 0 94749
## [10] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,
## Weather_Conditions=Fine no high winds} 0.6569801 0 92014
*Note: Here we can see that the majority of the accidents severety are Slight and the number of casualties small.
Another info is that the small casuality occur mostly without existence of any pedestrian physiscal facilities.
Another info is that the majority of the small accidents occur in the Single carriage road type.
quality(itemsets)$lift <- interestMeasure(itemsets, measure="lift", trans = trans)
## Lets analyse
inspect(head(sort(itemsets, by = "lift"), n=10))
## items support transIdenticalToItemsets count lift
## [1] {Speed_limit=70,
## Urban_or_Rural_Area=Rural,
## Road_Type=Dual carriageway,
## Junction_Detail=Slip road} 0.004341121 0 608 87.83085
## [2] {Speed_limit=70,
## Road_Type=Dual carriageway,
## Junction_Detail=Slip road,
## Junction_Control=Give way or uncontrolled} 0.005255041 0 736 74.29733
## [3] {Light_Conditions=Darkness - lights lit,
## Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Junction_Detail=Crossroads,
## Junction_Control=Auto traffic signal} 0.011345462 0 1589 73.30477
## [4] {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Road_Type=Dual carriageway,
## Junction_Detail=Crossroads,
## Junction_Control=Auto traffic signal} 0.007775461 0 1089 68.29762
## [5] {Urban_or_Rural_Area=Urban,
## Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Junction_Detail=Crossroads,
## Junction_Control=Auto traffic signal} 0.033279545 0 4661 64.33681
## [6] {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Day_Period=Evening,
## Junction_Detail=Crossroads,
## Junction_Control=Auto traffic signal} 0.007554121 0 1058 58.95044
## [7] {Speed_limit=30,
## Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Junction_Detail=Crossroads,
## Junction_Control=Auto traffic signal} 0.029566745 0 4141 58.72352
## [8] {Month=Nov,
## Road_Surface_Conditions=Wet or damp,
## Weather_Conditions=Raining + high winds,
## Season=Fall} 0.005119381 0 717 49.62872
## [9] {Speed_limit=70,
## Light_Conditions=Darkness - no lighting,
## Urban_or_Rural_Area=Rural,
## Road_Type=Dual carriageway} 0.008717941 0 1221 49.38169
## [10] {Speed_limit=40,
## Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Road_Type=Dual carriageway,
## Junction_Control=Auto traffic signal} 0.004676701 0 655 47.89292
plot(head(sort(itemsets_subset, by = "lift"), n=10), method = "graph", control=list(cex=.8))
*Note: A quick analisys shows that Accident_Severity=Fatal, Light_Conditions=Darkness - lights lit, Road_Type=Roundabout, Junction_Detail=Roundabout have an reasonable lift suggesting that they Co-occurr together, what makes sense regarding the fact that suggest a city crossing, (City))
# Generate the subset transactions containing only overwellming number casualties
trans_sub <- subset(trans, items %in% "Casualties_Class=Overweelming")
# Generate the itemsets
itemsets_subset <- apriori(trans_sub, parameter = list(target = "frequent",
supp=min_supp, minlen = min_len, maxlen=max_len))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.0036 2
## maxlen target ext
## 4 frequent itemsets TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 0
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[52 item(s), 18 transaction(s)] done [0.00s].
## sorting and recoding items ... [52 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## sorting transactions ... done [0.00s].
## writing ... [16234 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(head(sort(itemsets_subset), n=10))
## items support transIdenticalToItemsets count
## [1] {Light_Conditions=Daylight,
## Casualties_Class=Overweelming} 0.9444444 0 17
## [2] {Road_Surface_Conditions=Dry,
## Casualties_Class=Overweelming} 0.8888889 0 16
## [3] {Road_Surface_Conditions=Dry,
## Weather_Conditions=Fine no high winds} 0.8333333 0 15
## [4] {Casualties_Class=Overweelming,
## Weather_Conditions=Fine no high winds} 0.8333333 0 15
## [5] {Light_Conditions=Daylight,
## Road_Surface_Conditions=Dry} 0.8333333 0 15
## [6] {Road_Surface_Conditions=Dry,
## Casualties_Class=Overweelming,
## Weather_Conditions=Fine no high winds} 0.8333333 0 15
## [7] {Light_Conditions=Daylight,
## Road_Surface_Conditions=Dry,
## Casualties_Class=Overweelming} 0.8333333 0 15
## [8] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,
## Casualties_Class=Overweelming} 0.7777778 0 14
## [9] {Light_Conditions=Daylight,
## Weather_Conditions=Fine no high winds} 0.7777778 0 14
## [10] {Light_Conditions=Daylight,
## Road_Surface_Conditions=Dry,
## Weather_Conditions=Fine no high winds} 0.7777778 0 14
itemsets_subset <- subset(itemsets_subset, subset = items %in% "Casualties_Class=Overweelming")
quality(itemsets_subset)$lift <- interestMeasure(itemsets_subset,
measure="lift", trans = trans_sub)
# Lets analyse
inspect(head(sort(itemsets_subset, by = "lift"), n=10))
## items support transIdenticalToItemsets count lift
## [1] {Speed_limit=60,
## Road_Type=Roundabout,
## Junction_Detail=Roundabout,
## Casualties_Class=Overweelming} 0.05555556 0 1 324
## [2] {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Junction_Detail=More than 4 arms (not roundabout),
## Casualties_Class=Overweelming,
## Weather_Conditions=Raining no high winds} 0.05555556 0 1 324
## [3] {Accident_Severity=Fatal,
## Month=Feb,
## Casualties_Class=Overweelming,
## Weather_Conditions=Fog or mist} 0.05555556 0 1 324
## [4] {Light_Conditions=Darkness - lights lit,
## Day_Period=Night,
## Road_Type=One way street,
## Casualties_Class=Overweelming} 0.05555556 0 1 324
## [5] {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Road_Surface_Conditions=Wet or damp,
## Junction_Detail=More than 4 arms (not roundabout),
## Casualties_Class=Overweelming} 0.05555556 0 1 162
## [6] {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Junction_Detail=More than 4 arms (not roundabout),
## Junction_Control=Auto traffic signal,
## Casualties_Class=Overweelming} 0.05555556 0 1 162
## [7] {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Road_Surface_Conditions=Wet or damp,
## Casualties_Class=Overweelming,
## Weather_Conditions=Raining no high winds} 0.05555556 0 1 162
## [8] {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,
## Junction_Control=Auto traffic signal,
## Casualties_Class=Overweelming,
## Weather_Conditions=Raining no high winds} 0.05555556 0 1 162
## [9] {Road_Surface_Conditions=Wet or damp,
## Junction_Detail=More than 4 arms (not roundabout),
## Casualties_Class=Overweelming,
## Weather_Conditions=Raining no high winds} 0.05555556 0 1 162
## [10] {Junction_Detail=More than 4 arms (not roundabout),
## Junction_Control=Auto traffic signal,
## Casualties_Class=Overweelming,
## Weather_Conditions=Raining no high winds} 0.05555556 0 1 162
plot(head(sort(itemsets_subset, by = "lift"), n=10), method = "graph", control=list(cex=.8))
*Note: {Accident_Severity=Fatal, Month=feb., Casualties_Class=Overweelming,Weather_Conditions=Fog or mist} We may find that fog, february, Fatal accidnet and number of casuaties high co-occur frequently. An typical conditions to have major accidents
rules <- apriori(trans, parameter = list(supp=min_supp, maxlen=max_len, target="rules"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.0036 1
## maxlen target ext
## 4 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 504
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[85 item(s), 140056 transaction(s)] done [0.09s].
## sorting and recoding items ... [75 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 done [0.32s].
## writing ... [52078 rule(s)] done [0.01s].
## creating S4 object ... done [0.05s].
inspect(head(sort(rules, by="lift"), n=10))
## lhs rhs support confidence coverage lift count
## [1] {Speed_limit=60,
## Day_Period=Evening,
## Season=Fall} => {Light_Conditions=Darkness - no lighting} 0.004184041 0.8060523 0.005190781 15.55421 586
## [2] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Junction_Detail=Not at junction or within 20 metres} => {Speed_limit=70} 0.008418061 0.9013761 0.009339121 13.11481 1179
## [3] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Junction_Control=Data missing or out of range} => {Speed_limit=70} 0.008418061 0.9013761 0.009339121 13.11481 1179
## [4] {Accident_Severity=Slight,
## Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.007475581 0.9010327 0.008296681 13.10981 1047
## [5] {Light_Conditions=Darkness - no lighting,
## Urban_or_Rural_Area=Rural,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.008717941 0.8931968 0.009760382 12.99580 1221
## [6] {Light_Conditions=Darkness - no lighting,
## Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.009446222 0.8921106 0.010588622 12.98000 1323
## [7] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Weather_Conditions=Fine no high winds} => {Speed_limit=70} 0.006326041 0.8913481 0.007097161 12.96890 886
## [8] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Road_Surface_Conditions=Dry} => {Speed_limit=70} 0.004619581 0.8911846 0.005183641 12.96652 647
## [9] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.009474782 0.8900067 0.010645742 12.94938 1327
## [10] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Season=Fall} => {Speed_limit=70} 0.003955561 0.8892456 0.004448221 12.93831 554
plot(rules)
(200/nrow(trans))
## [1] 0.001428
min_supp = 0.0015
# Generate the items
rules <- apriori(trans, parameter = list(supp=min_supp, maxlen=max_len, target ="rules"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.0015 1
## maxlen target ext
## 4 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 210
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[85 item(s), 140056 transaction(s)] done [0.09s].
## sorting and recoding items ... [77 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 done [0.32s].
## writing ... [75935 rule(s)] done [0.01s].
## creating S4 object ... done [0.05s].
inspect(head(sort(rules, by="lift"), n=10))
## lhs rhs support confidence coverage lift count
## [1] {Speed_limit=60,
## Month=Oct,
## Day_Period=Evening} => {Light_Conditions=Darkness - no lighting} 0.001599360 0.8296296 0.001927800 16.00918 224
## [2] {Speed_limit=60,
## Day_Period=Evening,
## Season=Fall} => {Light_Conditions=Darkness - no lighting} 0.004184041 0.8060523 0.005190781 15.55421 586
## [3] {Light_Conditions=Darkness - no lighting,
## Month=Dec,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.001563660 0.9201681 0.001699320 13.38823 219
## [4] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Season=Winter} => {Speed_limit=70} 0.003420061 0.9037736 0.003784201 13.14969 479
## [5] {Light_Conditions=Darkness - no lighting,
## Day_Period=Night,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.002941680 0.9035088 0.003255841 13.14584 412
## [6] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Junction_Detail=Not at junction or within 20 metres} => {Speed_limit=70} 0.008418061 0.9013761 0.009339121 13.11481 1179
## [7] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Junction_Control=Data missing or out of range} => {Speed_limit=70} 0.008418061 0.9013761 0.009339121 13.11481 1179
## [8] {Accident_Severity=Slight,
## Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.007475581 0.9010327 0.008296681 13.10981 1047
## [9] {Light_Conditions=Darkness - no lighting,
## Day_Period=Afternoon,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.002606100 0.8990148 0.002898840 13.08045 365
## [10] {Light_Conditions=Darkness - no lighting,
## Month=Jan,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.001549380 0.8966942 0.001727880 13.04669 217
plot(rules)
*Note: We may find some rules that area obvious to us such as Speed_limit=60, Month=Nov, Day_Period=Evening} => {Light_Conditions=Darkness - no lighting}. In order to obtain more relevant ones we gona focus in the cases that acciddents are fatal.
r_subset <- subset(rules, subset = items %in% "Accident_Severity=Fatal")
## Inspect
inspect(head(sort(r_subset, by="lift"), 10))
## lhs rhs support confidence coverage lift count
## [1] {Accident_Severity=Fatal,
## Speed_limit=60,
## Day_Period=Afternoon} => {Urban_or_Rural_Area=Rural} 0.001685040 0.9915966 0.001699320 2.894761 236
## [2] {Accident_Severity=Fatal,
## Speed_limit=60,
## Light_Conditions=Daylight} => {Urban_or_Rural_Area=Rural} 0.002770320 0.9897959 0.002798880 2.889504 388
## [3] {Accident_Severity=Fatal,
## Speed_limit=60,
## Road_Type=Single carriageway} => {Urban_or_Rural_Area=Rural} 0.003855601 0.9854015 0.003912721 2.876676 540
## [4] {Accident_Severity=Fatal,
## Speed_limit=60,
## Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres} => {Urban_or_Rural_Area=Rural} 0.004019821 0.9842657 0.004084081 2.873360 563
## [5] {Accident_Severity=Fatal,
## Speed_limit=60,
## Road_Surface_Conditions=Dry} => {Urban_or_Rural_Area=Rural} 0.002727480 0.9820051 0.002777460 2.866761 382
## [6] {Accident_Severity=Fatal,
## Speed_limit=60,
## Junction_Control=Data missing or out of range} => {Urban_or_Rural_Area=Rural} 0.003113040 0.9819820 0.003170161 2.866693 436
## [7] {Accident_Severity=Fatal,
## Speed_limit=60,
## Junction_Detail=Not at junction or within 20 metres} => {Urban_or_Rural_Area=Rural} 0.003105900 0.9819413 0.003163021 2.866574 435
## [8] {Accident_Severity=Fatal,
## Speed_limit=60,
## Weather_Conditions=Fine no high winds} => {Urban_or_Rural_Area=Rural} 0.003320101 0.9810127 0.003384361 2.863863 465
## [9] {Accident_Severity=Fatal,
## Speed_limit=60} => {Urban_or_Rural_Area=Rural} 0.004034101 0.9809028 0.004112641 2.863543 565
## [10] {Accident_Severity=Fatal,
## Speed_limit=60,
## Casualties_Class=Small} => {Urban_or_Rural_Area=Rural} 0.002984520 0.9766355 0.003055920 2.851085 418
itemFrequencyPlot(items(r_subset), topN=10, cex.names=.5)
plot(head(sort(r_subset, by="lift"), 10),
method="graph", control=list(cex=.7))
*Note: We may find that {Accident_Severity=Fatal, Speed_limit=60, Light_Conditions=Daylight} => {Urban_or_Rural_Area=Rural} Co-occur together the lift is not very high, but is bigger than one what may suggest that they are positive co-related.
r_subset <- subset(rules, subset = items %in% "Light_Conditions=Darkness - no lighting")
## Inspect
inspect(head(sort(r_subset, by="lift"), n=10))
## lhs rhs support confidence coverage lift count
## [1] {Speed_limit=60,
## Month=Oct,
## Day_Period=Evening} => {Light_Conditions=Darkness - no lighting} 0.001599360 0.8296296 0.001927800 16.00918 224
## [2] {Speed_limit=60,
## Day_Period=Evening,
## Season=Fall} => {Light_Conditions=Darkness - no lighting} 0.004184041 0.8060523 0.005190781 15.55421 586
## [3] {Light_Conditions=Darkness - no lighting,
## Month=Dec,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.001563660 0.9201681 0.001699320 13.38823 219
## [4] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Season=Winter} => {Speed_limit=70} 0.003420061 0.9037736 0.003784201 13.14969 479
## [5] {Light_Conditions=Darkness - no lighting,
## Day_Period=Night,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.002941680 0.9035088 0.003255841 13.14584 412
## [6] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Junction_Detail=Not at junction or within 20 metres} => {Speed_limit=70} 0.008418061 0.9013761 0.009339121 13.11481 1179
## [7] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Junction_Control=Data missing or out of range} => {Speed_limit=70} 0.008418061 0.9013761 0.009339121 13.11481 1179
## [8] {Accident_Severity=Slight,
## Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.007475581 0.9010327 0.008296681 13.10981 1047
## [9] {Light_Conditions=Darkness - no lighting,
## Day_Period=Afternoon,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.002606100 0.8990148 0.002898840 13.08045 365
## [10] {Light_Conditions=Darkness - no lighting,
## Month=Jan,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.001549380 0.8966942 0.001727880 13.04669 217
itemFrequencyPlot(items(r_subset), topN=10, cex.names=0.5)
plot(head(sort(r_subset, by="lift"), 10),
method="graph", control=list(cex=.7))
*Note: From an quick analisys, Speed_limit=60, Month=Nov, Day_Period=Evening} => {Light_Conditions=Darkness - no lighting} is a coerent rule but not very usefull.
##Subset by Road Surface conditions
r_subset <- subset(rules, subset = items %in% "Road_Surface_Conditions=Wet or damp")
# Inspect
inspect(head(sort(r_subset, by="lift"), n=10))
## lhs rhs support confidence coverage lift count
## [1] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Road_Surface_Conditions=Wet or damp} => {Speed_limit=70} 0.004333981 0.8861314 0.004890901 12.893000 607
## [2] {Road_Type=Roundabout,
## Road_Surface_Conditions=Wet or damp,
## Junction_Control=Auto traffic signal} => {Junction_Detail=Roundabout} 0.001535100 0.9862385 0.001556520 11.118782 215
## [3] {Speed_limit=40,
## Road_Type=Roundabout,
## Road_Surface_Conditions=Wet or damp} => {Junction_Detail=Roundabout} 0.002763180 0.9748111 0.002834580 10.989949 387
## [4] {Speed_limit=60,
## Road_Type=Roundabout,
## Road_Surface_Conditions=Wet or damp} => {Junction_Detail=Roundabout} 0.001906380 0.9709091 0.001963500 10.945959 267
## [5] {Urban_or_Rural_Area=Rural,
## Road_Type=Roundabout,
## Road_Surface_Conditions=Wet or damp} => {Junction_Detail=Roundabout} 0.006097561 0.9467849 0.006440281 10.673984 854
## [6] {Road_Type=Roundabout,
## Road_Surface_Conditions=Wet or damp,
## Season=Summer} => {Junction_Detail=Roundabout} 0.002984520 0.9186813 0.003248701 10.357146 418
## [7] {Day_Period=Night,
## Road_Type=Roundabout,
## Road_Surface_Conditions=Wet or damp} => {Junction_Detail=Roundabout} 0.001842120 0.9148936 0.002013480 10.314444 258
## [8] {Month=Jan,
## Road_Type=Roundabout,
## Road_Surface_Conditions=Wet or damp} => {Junction_Detail=Roundabout} 0.002370480 0.8997290 0.002634660 10.143479 332
## [9] {Road_Type=Roundabout,
## Road_Surface_Conditions=Wet or damp,
## Weather_Conditions=Fine no high winds} => {Junction_Detail=Roundabout} 0.006561661 0.8896418 0.007375621 10.029757 919
## [10] {Day_Period=Afternoon,
## Road_Type=Roundabout,
## Road_Surface_Conditions=Wet or damp} => {Junction_Detail=Roundabout} 0.005776261 0.8860898 0.006518821 9.989712 809
itemFrequencyPlot(items(r_subset), topN=10, cex.names=0.5) #(Too big to plot)
plot(head(sort(r_subset, by="lift"), 10),
method="graph", control=list(cex=.7))
*Note: We may find that Light_Conditions=Darkness - no lighting, Road_Type=Dual carriageway, Road_Surface_Conditions=Wet or damp} => {Speed_limit=70} have an lift value > 1 suggesting that they co-occur together (of course wet conditions and Darkness leads to more accidents).
r_subset <- subset(rules, subset = items %in% "Road_Type=Dual carriageway")
# Inspect
inspect(head(sort(r_subset, by="lift"), n=10))
## lhs rhs support confidence coverage lift count
## [1] {Light_Conditions=Darkness - no lighting,
## Month=Dec,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.001563660 0.9201681 0.001699320 13.38823 219
## [2] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Season=Winter} => {Speed_limit=70} 0.003420061 0.9037736 0.003784201 13.14969 479
## [3] {Light_Conditions=Darkness - no lighting,
## Day_Period=Night,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.002941680 0.9035088 0.003255841 13.14584 412
## [4] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Junction_Detail=Not at junction or within 20 metres} => {Speed_limit=70} 0.008418061 0.9013761 0.009339121 13.11481 1179
## [5] {Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway,
## Junction_Control=Data missing or out of range} => {Speed_limit=70} 0.008418061 0.9013761 0.009339121 13.11481 1179
## [6] {Accident_Severity=Slight,
## Light_Conditions=Darkness - no lighting,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.007475581 0.9010327 0.008296681 13.10981 1047
## [7] {Light_Conditions=Darkness - no lighting,
## Day_Period=Afternoon,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.002606100 0.8990148 0.002898840 13.08045 365
## [8] {Light_Conditions=Darkness - no lighting,
## Month=Jan,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.001549380 0.8966942 0.001727880 13.04669 217
## [9] {Light_Conditions=Darkness - no lighting,
## Urban_or_Rural_Area=Rural,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.008717941 0.8931968 0.009760382 12.99580 1221
## [10] {Light_Conditions=Darkness - no lighting,
## Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,
## Road_Type=Dual carriageway} => {Speed_limit=70} 0.009446222 0.8921106 0.010588622 12.98000 1323
itemFrequencyPlot(items(r_subset), topN=10, cex.names=0.5) #(Too big to plot)
plot(head(sort(r_subset, by="lift"), 10),
method="graph", control=list(cex=.7))
*Note: We may find the rule {Light_Conditions=Darkness - no lighting, Road_Type=Dual carriageway, Casualties_Class=Medium} => {Speed_limit=70} contains a co-occurence of high number of car involved. Clearely a auto -way.
rules_convi <- cbind(as(rules, "data.frame"),
conviction=interestMeasure(rules,
"conviction", trans))
# Order by descreasing value of conviction
rules_convi <- rules_convi[order(rules_convi$conviction),]
# Get only the top ones%
per <-0.0005 # define percentage
top_rules <- head(rules_convi[order(rules_convi$conviction, decreasing = T),], per*nrow(rules_convi))
# Show then (they are ordered by conviction)
(top_rules$rules)
## [1] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [2] {Accident_Severity=Slight,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [3] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [4] {Speed_limit=30,Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [5] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}
## [6] {Accident_Severity=Slight,Speed_limit=30,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [7] {Accident_Severity=Slight,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [8] {Accident_Severity=Slight,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}
## [9] {Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [10] {Accident_Severity=Slight,Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [11] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}
## [12] {Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}
## [13] {Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [14] {Day_Period=Afternoon,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [15] {Light_Conditions=Daylight,Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [16] {Accident_Severity=Slight,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}
## [17] {Day_Period=Morning,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [18] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [19] {Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}
## [20] {Light_Conditions=Daylight,Day_Period=Morning,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [21] {Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}
## [22] {Accident_Severity=Slight,Light_Conditions=Daylight,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [23] {Day_Period=Morning,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}
## [24] {Accident_Severity=Slight,Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [25] {Light_Conditions=Daylight,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [26] {Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}
## [27] {Road_Type=Single carriageway,Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [28] {Day_Period=Morning,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}
## [29] {Speed_limit=30,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [30] {Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}
## [31] {Light_Conditions=Daylight,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [32] {Speed_limit=30,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}
## [33] {Day_Period=Morning,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [34] {Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [35] {Speed_limit=30,Day_Period=Afternoon,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}
## [36] {Light_Conditions=Daylight,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}
## [37] {Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}
## 75935 Levels: {} => {Accident_Severity=Slight} ... {Weather_Conditions=Unknown} => {Casualties_Class=Small}
*Note: Of course the most commun ones appears first with strong conviction. However we are insterested in a more grain set of groups of itemsets and rules, namelly those that contain Fatal accidents as consequent.
# From our rules, subset then
rules_subset <- subset(rules, subset=(rhs %in% c("Accident_Severity=Fatal")))
# Check if any rules were generated
summary(rules_subset)
## set of 0 rules
*Note: The current parameters values were not able to generate the items that correspond to the fatal accidents as consequent. (Rare Cases)
# In order to save our memory we gonna subset the transactions that contain fatal ones
trans_sub <- subset(trans, items %in% "Accident_Severity=Fatal")
## Set new parameters
(1/nrow(trans_sub))
## [1] 0.0006188119
min_supp <- 50/nrow(trans_sub)
min_conf <- 0.00025
# Generate the rules with consequent as Accident_Severity=Fatal"
rules_subset <- apriori(trans_sub, parameter = list(supp=min_supp, conf=min_conf,
target ="rules"),
appearance = list(rhs=c("Accident_Severity=Fatal")))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.00025 0.1 1 none FALSE TRUE 5 0.03094059 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 50
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[79 item(s), 1616 transaction(s)] done [0.00s].
## sorting and recoding items ... [49 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10 done [0.06s].
## writing ... [21346 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
summary(rules_subset)
## set of 21346 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8 9 10
## 1 48 530 2209 4709 5750 4546 2440 897 216
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.097 7.000 10.000
##
## summary of quality measures:
## support confidence coverage lift count
## Min. :0.03094 Min. :1 Min. :0.03094 Min. :1 Min. : 50.0
## 1st Qu.:0.03651 1st Qu.:1 1st Qu.:0.03651 1st Qu.:1 1st Qu.: 59.0
## Median :0.04703 Median :1 Median :0.04703 Median :1 Median : 76.0
## Mean :0.06513 Mean :1 Mean :0.06513 Mean :1 Mean : 105.2
## 3rd Qu.:0.07054 3rd Qu.:1 3rd Qu.:0.07054 3rd Qu.:1 3rd Qu.: 114.0
## Max. :1.00000 Max. :1 Max. :1.00000 Max. :1 Max. :1616.0
##
## mining info:
## data ntransactions support confidence
## trans_sub 1616 0.03094059 0.00025
inspect(head(sort(subset(rules_subset, subset= rhs %pin% "Accident_Severity=Fatal")), by="lift"), n=5)
## lhs rhs support confidence coverage lift count
## [1] {} => {Accident_Severity=Fatal} 1.0000000 1 1.0000000 1 1616
## [2] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres} => {Accident_Severity=Fatal} 0.8620050 1 0.8620050 1 1393
## [3] {Weather_Conditions=Fine no high winds} => {Accident_Severity=Fatal} 0.8199257 1 0.8199257 1 1325
## [4] {Casualties_Class=Small} => {Accident_Severity=Fatal} 0.8193069 1 0.8193069 1 1324
## [5] {Road_Type=Single carriageway} => {Accident_Severity=Fatal} 0.7493812 1 0.7493812 1 1211
## [6] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,
## Weather_Conditions=Fine no high winds} => {Accident_Severity=Fatal} 0.7054455 1 0.7054455 1 1140
itemFrequencyPlot(items(rules_subset), topN=10, cex.names=0.5) #(Too big to plot)
plot(head(sort(rules_subset, by="lift"), 10),
method="graph", control=list(cex=.7))
*Note: The diagram may suggest that the fatal accidents happens with no particular positive correlations (lift =1 -> Independent).
# In order to save our memory we gonna subset the transactions that Casualties_Class=Large
trans_sub <- subset(trans, items %in% "Casualties_Class=Large")
# Set new parameters
(500/nrow(trans_sub))
## [1] 0.3810976
min_supp <- 500/nrow(trans_sub)
min_conf <- 0.00025
# Generate the rules with consequent as Accident_Severity=Fatal"
rules_subset <- apriori(trans_sub, parameter = list(supp=min_supp, conf=min_conf,
target ="rules"),
appearance = list(rhs=c("Casualties_Class=Large")))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.00025 0.1 1 none FALSE TRUE 5 0.3810976 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 500
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[77 item(s), 1312 transaction(s)] done [0.00s].
## sorting and recoding items ... [13 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [65 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(rules_subset)
## set of 65 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5
## 1 12 26 22 4
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 3.000 3.246 4.000 5.000
##
## summary of quality measures:
## support confidence coverage lift count
## Min. :0.3819 Min. :1 Min. :0.3819 Min. :1 Min. : 501
## 1st Qu.:0.4200 1st Qu.:1 1st Qu.:0.4200 1st Qu.:1 1st Qu.: 551
## Median :0.4794 Median :1 Median :0.4794 Median :1 Median : 629
## Mean :0.5092 Mean :1 Mean :0.5092 Mean :1 Mean : 668
## 3rd Qu.:0.5610 3rd Qu.:1 3rd Qu.:0.5610 3rd Qu.:1 3rd Qu.: 736
## Max. :1.0000 Max. :1 Max. :1.0000 Max. :1 Max. :1312
##
## mining info:
## data ntransactions support confidence
## trans_sub 1312 0.3810976 0.00025
inspect(head(sort(subset(rules_subset, subset= rhs %pin% "Casualties_Class=Large")), by="lift"), n=5)
## lhs rhs support confidence coverage lift count
## [1] {} => {Casualties_Class=Large} 1.0000000 1 1.0000000 1 1312
## [2] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres} => {Casualties_Class=Large} 0.8605183 1 0.8605183 1 1129
## [3] {Weather_Conditions=Fine no high winds} => {Casualties_Class=Large} 0.8170732 1 0.8170732 1 1072
## [4] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,
## Weather_Conditions=Fine no high winds} => {Casualties_Class=Large} 0.7065549 1 0.7065549 1 927
## [5] {Light_Conditions=Daylight} => {Casualties_Class=Large} 0.6996951 1 0.6996951 1 918
## [6] {Road_Type=Single carriageway} => {Casualties_Class=Large} 0.6928354 1 0.6928354 1 909
*Note: Clearely an rule regarding a high speed motorway
# Quick copy to enable reuse of code
subset <- subset_entropy
# Remove so save memory
rm(subset_entropy)
# Convert to transactions
trans <- as(subset, "transactions")
summary(trans)
(500/nrow(trans))
min_supp <- 0.0036
min_conf <- 0.5
min_lift <- 0.6
min_len <- 2
max_len <- 4
# Generate frequent itemsets
itemsets <- apriori(trans, parameter = list(target = "frequent",
supp=min_supp, minlen = min_len, maxlen =max_len))
inspect(head(sort(itemsets), n=10))
quality(itemsets)$lift <- interestMeasure(itemsets, measure="lift", trans = trans)
# Lets analyse
inspect(head(sort(itemsets, by = "lift"), n=10))
trans_sub <- subset(trans, items %in% "Accident_Severity=Fatal")
## Generate the itemsets
itemsets_subset <- apriori(trans_sub, parameter = list(target = "frequent",
supp=min_supp, minlen = min_len, maxlen=max_len))
inspect(head(sort(itemsets_subset), n=10))
*Note: Police always attend fatal accidents, norma
Generate the rules with consequent as Accident_Severity=Fatal"
rules_subset <- apriori(trans_sub, parameter = list(supp=min_supp, conf=min_conf,
target ="rules"),
appearance = list(rhs=c("Accident_Severity=Fatal")))
summary(rules_subset)
inspect(head(sort(subset(rules_subset, subset= rhs %pin% "Accident_Severity=Fatal")), by="lift"), n=5)
*Note: We see that the almost the same results were attained. Police always attend fata crashes {Junction_Detail=Not at junction or within 20 metres} => {Accident_Severity=Fatal} Ok no infraestructure or juctions.
# Staring with our last dataframe preprocessed
df_model <- df_subset # make shallow copy
# Show number inconplete cases
(apply(is.na(df_model), 2, sum))
## Location_Easting_OSGR
## 27
## Location_Northing_OSGR
## 27
## Longitude
## 27
## Latitude
## 27
## Police_Force
## 0
## Accident_Severity
## 0
## Number_of_Vehicles
## 0
## Number_of_Casualties
## 0
## Date
## 0
## Day_of_Week
## 0
## Time
## 0
## Local_Authority_District
## 0
## First_Road_Class
## 0
## First_Road_Number
## 0
## Road_Type
## 0
## Speed_limit
## 0
## Junction_Detail
## 0
## Junction_Control
## 0
## Second_Road_Class
## 56603
## Second_Road_Number
## 0
## Pedestrian_Crossing-Human_Control
## 0
## Pedestrian_Crossing-Physical_Facilities
## 0
## Light_Conditions
## 0
## Weather_Conditions
## 0
## Road_Surface_Conditions
## 0
## Special_Conditions_at_Site
## 0
## Carriageway_Hazards
## 0
## Urban_or_Rural_Area
## 0
## Did_Police_Officer_Attend_Scene_of_Accident
## 8
## LSOA_of_Accident_Location
## 0
## Season
## 0
## Month
## 0
## Hour
## 18
## Day_Period
## 18
## Casualties_Class
## 0
## Number_Vehicles_Class
## 0
# Show number of entries
(nrow(df_model))
## [1] 140056
# Drop datasframes
#rm(df_subset)
#rm(df_copy)
*Note: Second_Road_Class has to many incomplete cases, just remove it, and locations esting and lsoa accident
# Lets drop initialy the time to see things,
df_model <- df_model[, -c('Date', 'Time', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
'LSOA_of_Accident_Location','Second_Road_Class', 'First_Road_Number', 'Second_Road_Number', 'Accident_Severity', 'Day_of_Week')]
# We have enough examples for modeling, just remove incomplete entries (only few of then)
df_model <- na.omit(df_model)
# Show number incomplete cases
(apply(is.na(df_model), 2, sum))
## Longitude
## 0
## Latitude
## 0
## Police_Force
## 0
## Number_of_Vehicles
## 0
## Number_of_Casualties
## 0
## Local_Authority_District
## 0
## First_Road_Class
## 0
## Road_Type
## 0
## Speed_limit
## 0
## Junction_Detail
## 0
## Junction_Control
## 0
## Pedestrian_Crossing-Human_Control
## 0
## Pedestrian_Crossing-Physical_Facilities
## 0
## Light_Conditions
## 0
## Weather_Conditions
## 0
## Road_Surface_Conditions
## 0
## Special_Conditions_at_Site
## 0
## Carriageway_Hazards
## 0
## Urban_or_Rural_Area
## 0
## Did_Police_Officer_Attend_Scene_of_Accident
## 0
## Season
## 0
## Month
## 0
## Hour
## 0
## Day_Period
## 0
## Casualties_Class
## 0
## Number_Vehicles_Class
## 0
# Show number of entries
(nrow(df_model))
## [1] 140003
# Show number of levels each column (Levels columns only)
(apply( df_model, 2, function(x) length( unique(x) )))
## Longitude
## 133293
## Latitude
## 106277
## Police_Force
## 51
## Number_of_Vehicles
## 16
## Number_of_Casualties
## 22
## Local_Authority_District
## 380
## First_Road_Class
## 6
## Road_Type
## 6
## Speed_limit
## 8
## Junction_Detail
## 10
## Junction_Control
## 5
## Pedestrian_Crossing-Human_Control
## 4
## Pedestrian_Crossing-Physical_Facilities
## 7
## Light_Conditions
## 5
## Weather_Conditions
## 9
## Road_Surface_Conditions
## 6
## Special_Conditions_at_Site
## 9
## Carriageway_Hazards
## 7
## Urban_or_Rural_Area
## 2
## Did_Police_Officer_Attend_Scene_of_Accident
## 3
## Season
## 4
## Month
## 12
## Hour
## 24
## Day_Period
## 4
## Casualties_Class
## 4
## Number_Vehicles_Class
## 3
# Quick view on data
(head(df_model))
## Longitude Latitude Police_Force Number_of_Vehicles
## 1: -0.198465 51.50554 Metropolitan Police 1
## 2: -0.178838 51.49184 Metropolitan Police 1
## 3: -0.205590 51.51491 Metropolitan Police 1
## 4: -0.208327 51.51495 Metropolitan Police 1
## 5: -0.206022 51.49657 Metropolitan Police 2
## 6: -0.193610 51.50079 Metropolitan Police 2
## Number_of_Casualties Local_Authority_District First_Road_Class
## 1: 1 Kensington and Chelsea C
## 2: 1 Kensington and Chelsea Unclassified
## 3: 1 Kensington and Chelsea B
## 4: 1 Kensington and Chelsea B
## 5: 1 Kensington and Chelsea A
## 6: 1 Kensington and Chelsea A
## Road_Type Speed_limit Junction_Detail
## 1: Single carriageway 30 T or staggered junction
## 2: Single carriageway 30 T or staggered junction
## 3: Single carriageway 30 Mini-roundabout
## 4: Single carriageway 30 Crossroads
## 5: Single carriageway 30 Crossroads
## 6: Single carriageway 30 T or staggered junction
## Junction_Control Pedestrian_Crossing-Human_Control
## 1: Give way or uncontrolled None within 50 metres
## 2: Give way or uncontrolled None within 50 metres
## 3: Give way or uncontrolled None within 50 metres
## 4: Give way or uncontrolled None within 50 metres
## 5: Auto traffic signal None within 50 metres
## 6: Give way or uncontrolled None within 50 metres
## Pedestrian_Crossing-Physical_Facilities
## 1: No physical crossing facilities within 50 metres
## 2: No physical crossing facilities within 50 metres
## 3: Zebra
## 4: No physical crossing facilities within 50 metres
## 5: Pedestrian phase at traffic signal junction
## 6: Pelican, puffin, toucan or similar non-junction pedestrian light crossing
## Light_Conditions Weather_Conditions Road_Surface_Conditions
## 1: Darkness - lights lit Fine no high winds Dry
## 2: Daylight Fine no high winds Dry
## 3: Darkness - lights lit Raining no high winds Wet or damp
## 4: Daylight Fine no high winds Wet or damp
## 5: Daylight Raining no high winds Wet or damp
## 6: Daylight Fine no high winds Wet or damp
## Special_Conditions_at_Site Carriageway_Hazards Urban_or_Rural_Area
## 1: None None Urban
## 2: None None Urban
## 3: None None Urban
## 4: None None Urban
## 5: None None Urban
## 6: None None Urban
## Did_Police_Officer_Attend_Scene_of_Accident Season Month Hour Day_Period
## 1: Yes Winter Jan 18 Afternoon
## 2: Yes Winter Jan 7 Morning
## 3: Yes Winter Jan 18 Afternoon
## 4: No Winter Jan 7 Morning
## 5: No Winter Jan 7 Morning
## 6: Yes Winter Jan 9 Morning
## Casualties_Class Number_Vehicles_Class
## 1: Small Small
## 2: Small Small
## 3: Small Small
## 4: Small Small
## 5: Small Small
## 6: Small Small
# Quick vies on column type see if all its ok
(str(df_model))
## Classes 'data.table' and 'data.frame': 140003 obs. of 26 variables:
## $ Longitude : num -0.198 -0.179 -0.206 -0.208 -0.206 ...
## $ Latitude : num 51.5 51.5 51.5 51.5 51.5 ...
## $ Police_Force : chr "Metropolitan Police" "Metropolitan Police" "Metropolitan Police" "Metropolitan Police" ...
## $ Number_of_Vehicles : int 1 1 1 1 2 2 2 2 2 2 ...
## $ Number_of_Casualties : int 1 1 1 1 1 1 1 1 1 2 ...
## $ Local_Authority_District : chr "Kensington and Chelsea" "Kensington and Chelsea" "Kensington and Chelsea" "Kensington and Chelsea" ...
## $ First_Road_Class : chr "C" "Unclassified" "B" "B" ...
## $ Road_Type : chr "Single carriageway" "Single carriageway" "Single carriageway" "Single carriageway" ...
## $ Speed_limit : Factor w/ 8 levels "0","10","20",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Junction_Detail : chr "T or staggered junction" "T or staggered junction" "Mini-roundabout" "Crossroads" ...
## $ Junction_Control : chr "Give way or uncontrolled" "Give way or uncontrolled" "Give way or uncontrolled" "Give way or uncontrolled" ...
## $ Pedestrian_Crossing-Human_Control : chr "None within 50 metres" "None within 50 metres" "None within 50 metres" "None within 50 metres" ...
## $ Pedestrian_Crossing-Physical_Facilities : chr "No physical crossing facilities within 50 metres" "No physical crossing facilities within 50 metres" "Zebra" "No physical crossing facilities within 50 metres" ...
## $ Light_Conditions : chr "Darkness - lights lit" "Daylight" "Darkness - lights lit" "Daylight" ...
## $ Weather_Conditions : chr "Fine no high winds" "Fine no high winds" "Raining no high winds" "Fine no high winds" ...
## $ Road_Surface_Conditions : chr "Dry" "Dry" "Wet or damp" "Wet or damp" ...
## $ Special_Conditions_at_Site : chr "None" "None" "None" "None" ...
## $ Carriageway_Hazards : chr "None" "None" "None" "None" ...
## $ Urban_or_Rural_Area : chr "Urban" "Urban" "Urban" "Urban" ...
## $ Did_Police_Officer_Attend_Scene_of_Accident: chr "Yes" "Yes" "Yes" "No" ...
## $ Season : chr "Winter" "Winter" "Winter" "Winter" ...
## $ Month : Factor w/ 12 levels "Apr","Aug","Dec",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Hour : num 18 7 18 7 7 9 9 15 6 15 ...
## $ Day_Period : Factor w/ 4 levels "Night","Morning",..: 3 2 3 2 2 2 2 3 1 3 ...
## $ Casualties_Class : Factor w/ 4 levels "Small","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Number_Vehicles_Class : Factor w/ 3 levels "Small","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
## NULL
*Note: perhaps latitude and longitus is to descriminative, lest drop it for now and find a way to summarize this later
# Lets drop initialy the time to see things
df_model <- df_model[, -c('Latitude', 'Longitude')]
# Show number of levels each column (Levels columns only)
(apply( df_model, 2, function(x) length( unique(x) )))
## Police_Force
## 51
## Number_of_Vehicles
## 16
## Number_of_Casualties
## 22
## Local_Authority_District
## 380
## First_Road_Class
## 6
## Road_Type
## 6
## Speed_limit
## 8
## Junction_Detail
## 10
## Junction_Control
## 5
## Pedestrian_Crossing-Human_Control
## 4
## Pedestrian_Crossing-Physical_Facilities
## 7
## Light_Conditions
## 5
## Weather_Conditions
## 9
## Road_Surface_Conditions
## 6
## Special_Conditions_at_Site
## 9
## Carriageway_Hazards
## 7
## Urban_or_Rural_Area
## 2
## Did_Police_Officer_Attend_Scene_of_Accident
## 3
## Season
## 4
## Month
## 12
## Hour
## 24
## Day_Period
## 4
## Casualties_Class
## 4
## Number_Vehicles_Class
## 3
# Debug
(nrow(df_model))
## [1] 140003
# Convert chars to factors in this reduced space
#df_model <- mutate_if(df_model, is.character, as.factor)
# Wuick pick
(nrow(df_model))
## [1] 140003
# Show number incomplete cases
(apply(is.na(df_model), 2, sum))
## Police_Force
## 0
## Number_of_Vehicles
## 0
## Number_of_Casualties
## 0
## Local_Authority_District
## 0
## First_Road_Class
## 0
## Road_Type
## 0
## Speed_limit
## 0
## Junction_Detail
## 0
## Junction_Control
## 0
## Pedestrian_Crossing-Human_Control
## 0
## Pedestrian_Crossing-Physical_Facilities
## 0
## Light_Conditions
## 0
## Weather_Conditions
## 0
## Road_Surface_Conditions
## 0
## Special_Conditions_at_Site
## 0
## Carriageway_Hazards
## 0
## Urban_or_Rural_Area
## 0
## Did_Police_Officer_Attend_Scene_of_Accident
## 0
## Season
## 0
## Month
## 0
## Hour
## 0
## Day_Period
## 0
## Casualties_Class
## 0
## Number_Vehicles_Class
## 0
#rm(df_copy)
rm(map)
# convert to upper case coliumns, some models dont handle this well
for( i in colnames(df_model)){
colnames(df_model)[which(colnames(df_model)==i)] = toupper(i)
}
# Ok, there is here a column messi, lest remove the -
setnames( df_model , "PEDESTRIAN_CROSSING-HUMAN_CONTROL" , "PEDESTRIAN_CROSSING_HUMAN_CONTROL" )
setnames( df_model , "PEDESTRIAN_CROSSING-PHYSICAL_FACILITIES" , "PEDESTRIAN_CROSSING_PHYSICAL_FACILITIES")
# Ensure thet they are factors
library(dplyr)
df_model=df_model %>% mutate_if(is.character, as.factor)
# This column has 53 categoriees, lest drop them
df_model <- df_model[, -c('LOCAL_AUTHORITY_DISTRICT')]
#Create an small sample (2000 cases)
sample_perc <- 0.05
# Extract samples for hold out method
set.seed(1234) # for reproduction
idx_sample <-sample(1:nrow(df_model),as.integer(sample_perc*nrow(df_model)))
length(idx_sample)
## [1] 7000
df_model <- df_model[idx_sample, ]
# Set the threshold
split_val <- 0.7
# Extract samples for hold out method
set.seed(1234) # for reproduction
idx.tr <-sample(1:nrow(df_model),as.integer(split_val*nrow(df_model)))
tr <- df_model[idx.tr,]
ts <- df_model[-idx.tr,]
ts = rbind(tr[1,],ts)
ts = ts[-1,]
# We should ensure a stratifyed sample
#library(caret)
#idx.tr <- createDataPartition(df_model, p = split_val, list = FALSE)
#tr <- df_model[idx.tr,]
#ts <- df_model[-idx.tr,]
# Debug
(str(tr))
## Classes 'data.table' and 'data.frame': 4900 obs. of 23 variables:
## $ POLICE_FORCE : Factor w/ 51 levels "Avon and Somerset",..: 31 28 30 15 26 50 47 10 24 3 ...
## $ NUMBER_OF_VEHICLES : int 1 2 1 2 2 1 1 2 2 2 ...
## $ NUMBER_OF_CASUALTIES : int 1 1 1 1 1 2 1 2 1 1 ...
## $ FIRST_ROAD_CLASS : Factor w/ 6 levels "A","A(M)","B",..: 4 1 6 3 1 1 6 3 1 3 ...
## $ ROAD_TYPE : Factor w/ 6 levels "Dual carriageway",..: 4 4 4 4 4 1 4 4 1 4 ...
## $ SPEED_LIMIT : Factor w/ 8 levels "0","10","20",..: 7 6 4 7 4 5 4 7 5 7 ...
## $ JUNCTION_DETAIL : Factor w/ 10 levels "Crossroads","Data missing or out of range",..: 5 10 10 10 8 5 5 10 1 5 ...
## $ JUNCTION_CONTROL : Factor w/ 5 levels "Authorised person",..: 3 4 4 4 2 3 3 4 4 3 ...
## $ PEDESTRIAN_CROSSING_HUMAN_CONTROL : Factor w/ 4 levels "Control by other authorised person",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ PEDESTRIAN_CROSSING_PHYSICAL_FACILITIES : Factor w/ 7 levels "Central refuge",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ LIGHT_CONDITIONS : Factor w/ 5 levels "Darkness - lighting unknown",..: 5 2 5 5 5 5 2 5 2 4 ...
## $ WEATHER_CONDITIONS : Factor w/ 9 levels "Fine + high winds",..: 2 2 2 2 2 2 5 2 2 6 ...
## $ ROAD_SURFACE_CONDITIONS : Factor w/ 6 levels "Data missing or out of range",..: 2 2 2 2 2 2 6 2 2 6 ...
## $ SPECIAL_CONDITIONS_AT_SITE : Factor w/ 9 levels "Auto signal part defective",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ CARRIAGEWAY_HAZARDS : Factor w/ 7 levels "Any animal in carriageway (except ridden horse)",..: 3 3 3 3 3 3 1 3 3 3 ...
## $ URBAN_OR_RURAL_AREA : Factor w/ 2 levels "Rural","Urban": 1 1 2 1 1 2 1 1 1 1 ...
## $ DID_POLICE_OFFICER_ATTEND_SCENE_OF_ACCIDENT: Factor w/ 3 levels "No","No - accident was reported using a self completion form (self rep only)",..: 3 3 1 3 1 3 3 3 3 3 ...
## $ SEASON : Factor w/ 4 levels "Fall","Spring",..: 2 3 4 4 4 3 1 4 1 1 ...
## $ MONTH : Factor w/ 12 levels "Apr","Aug","Dec",..: 9 12 5 8 8 7 10 8 12 11 ...
## $ HOUR : num 19 23 13 8 12 15 19 16 21 6 ...
## $ DAY_PERIOD : Factor w/ 4 levels "Night","Morning",..: 4 4 3 2 2 3 4 3 4 1 ...
## $ CASUALTIES_CLASS : Factor w/ 4 levels "Small","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ NUMBER_VEHICLES_CLASS : Factor w/ 3 levels "Small","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
## NULL
(nrow(tr))
## [1] 4900
(nrow(ts))
## [1] 2100
# Make several estimations and select the best model
res_TREES <- performanceEstimation(
PredTask(NUMBER_OF_CASUALTIES~ .,tr),
workflowVariants(learner="rpartXse",learner.pars=list(se=c(0,0.5,1, 2, 3))),
EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))
# Save the model results
save(res_TREES, file = "res_TREES1.RData")
# Load the data
load(file = "res_TREES1.RData")
# Plot the graphs
plot(res_TREES)
# See the performance rank
topPerformers(res_TREES)
## $tr.NUMBER_OF_CASUALTIES
## Workflow Estimate
## mse rpartXse.v1 0.243
# Get the parameters
(getWorkflow("rpartXse.v2",res_TREES))
## Workflow Object:
## Workflow ID :: rpartXse.v2
## Workflow Function :: standardWF
## Parameter values:
## learner.pars -> se=0.5
## learner -> rpartXse
require('randomForest')
#df_model1 <- df_model[, -c('LOCAL_AUTHORITY_DISTRICT')]
# Make several estimations and select the best model
res_RF <- performanceEstimation(
PredTask(NUMBER_OF_CASUALTIES ~ .,tr),
workflowVariants(learner="randomForest", learner.pars=list(ntree= c(100, 500, 1000))),
EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))
# Save the model results
save(res_RF, file = "res_RF.RData")
# Load the data
load(file = "res_RF.RData")
# Plot the graphs
plot(res_RF)
# See the performance rank
topPerformers(res_RF)
## $tr.NUMBER_OF_CASUALTIES
## Workflow Estimate
## mse randomForest.v3 0.264
# Make several estimations and select the best model
res_SVM <- performanceEstimation(
PredTask(NUMBER_OF_CASUALTIES ~ .,tr),
workflowVariants(learner="svm", learner.pars=list(kernel=c("linear", "radial"), cost=c(1,3), gamma=c(0.1,0.01), epsilon=0.1)),
EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))
# Save the model results
save(res_SVM, file = "res_SVM.RData")
# Load the data
load(file = "res_SVM.RData")
# Plot the graphs
plot(res_SVM)
# See the performance rank
topPerformers(res_SVM)
## $tr.NUMBER_OF_CASUALTIES
## Workflow Estimate
## mse svm.v3 0.254
# Make several estimations and select the best model
res_EXP <- performanceEstimation(
PredTask(NUMBER_OF_CASUALTIES ~ ., tr),
c( Workflow(learner="naiveBayes"),
workflowVariants(learner="svm", learner.pars=list(kernel=c("linear", "radial"), cost=c(1,3))),
Workflow(learner="randomForest", learner.pars=list( ntree=3000))
),
EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))
# Save the model results
save(res_EXP, file = "res_EXP.RData")
# Load the data
load(file = "res_EXP.RData")
# Plot the graphs
plot(res_EXP)
# See the performance rank
topPerformers(res_EXP)
## $tr.NUMBER_OF_CASUALTIES
## Workflow Estimate
## mse svm.v3 0.254
res_MODELS <- performanceEstimation(
PredTask(NUMBER_OF_CASUALTIES ~ ., tr),
workflowVariants("standardWF",
learner=c("rpartXse","svm","randomForest")),
EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))
# Save the model results
save(res_MODELS, file = "res_MODELS.RData")
# Load the data
load(file = "res_MODELS.RData")
# Plot the graphs
plot(res_MODELS)
# See the performance rank
topPerformers(res_MODELS)
## $tr.NUMBER_OF_CASUALTIES
## Workflow Estimate
## mse rpartXse 0.259
require('cluster')
require(cluster)
require(dplyr)
require(ggplot2)
require(readr)
require(Rtsne)
# Some intial colum name analisis
(colnames(df_subset))
## [1] "Location_Easting_OSGR"
## [2] "Location_Northing_OSGR"
## [3] "Longitude"
## [4] "Latitude"
## [5] "Police_Force"
## [6] "Accident_Severity"
## [7] "Number_of_Vehicles"
## [8] "Number_of_Casualties"
## [9] "Date"
## [10] "Day_of_Week"
## [11] "Time"
## [12] "Local_Authority_District"
## [13] "First_Road_Class"
## [14] "First_Road_Number"
## [15] "Road_Type"
## [16] "Speed_limit"
## [17] "Junction_Detail"
## [18] "Junction_Control"
## [19] "Second_Road_Class"
## [20] "Second_Road_Number"
## [21] "Pedestrian_Crossing-Human_Control"
## [22] "Pedestrian_Crossing-Physical_Facilities"
## [23] "Light_Conditions"
## [24] "Weather_Conditions"
## [25] "Road_Surface_Conditions"
## [26] "Special_Conditions_at_Site"
## [27] "Carriageway_Hazards"
## [28] "Urban_or_Rural_Area"
## [29] "Did_Police_Officer_Attend_Scene_of_Accident"
## [30] "LSOA_of_Accident_Location"
## [31] "Season"
## [32] "Month"
## [33] "Hour"
## [34] "Day_Period"
## [35] "Casualties_Class"
## [36] "Number_Vehicles_Class"
# Lets drop initialy the time to see things
df_notime <- df_subset[, -c('Date', 'Time', 'Location_Easting_OSGR', 'Location_Northing_OSGR', 'LSOA_of_Accident_Location','Second_Road_Class', 'Number_of_Casualties')]
# Access the number of levels
(apply(df_notime, 2, function(x) length( unique(x) )))
## Longitude
## 133317
## Latitude
## 106293
## Police_Force
## 51
## Accident_Severity
## 3
## Number_of_Vehicles
## 16
## Day_of_Week
## 7
## Local_Authority_District
## 380
## First_Road_Class
## 6
## First_Road_Number
## 4340
## Road_Type
## 6
## Speed_limit
## 8
## Junction_Detail
## 10
## Junction_Control
## 5
## Second_Road_Number
## 3795
## Pedestrian_Crossing-Human_Control
## 4
## Pedestrian_Crossing-Physical_Facilities
## 7
## Light_Conditions
## 5
## Weather_Conditions
## 9
## Road_Surface_Conditions
## 6
## Special_Conditions_at_Site
## 9
## Carriageway_Hazards
## 7
## Urban_or_Rural_Area
## 2
## Did_Police_Officer_Attend_Scene_of_Accident
## 4
## Season
## 4
## Month
## 12
## Hour
## 25
## Day_Period
## 5
## Casualties_Class
## 4
## Number_Vehicles_Class
## 3
# Access the incomplete cases
(apply(is.na(df_notime), 2, sum))
## Longitude
## 27
## Latitude
## 27
## Police_Force
## 0
## Accident_Severity
## 0
## Number_of_Vehicles
## 0
## Day_of_Week
## 0
## Local_Authority_District
## 0
## First_Road_Class
## 0
## First_Road_Number
## 0
## Road_Type
## 0
## Speed_limit
## 0
## Junction_Detail
## 0
## Junction_Control
## 0
## Second_Road_Number
## 0
## Pedestrian_Crossing-Human_Control
## 0
## Pedestrian_Crossing-Physical_Facilities
## 0
## Light_Conditions
## 0
## Weather_Conditions
## 0
## Road_Surface_Conditions
## 0
## Special_Conditions_at_Site
## 0
## Carriageway_Hazards
## 0
## Urban_or_Rural_Area
## 0
## Did_Police_Officer_Attend_Scene_of_Accident
## 8
## Season
## 0
## Month
## 0
## Hour
## 18
## Day_Period
## 18
## Casualties_Class
## 0
## Number_Vehicles_Class
## 0
# Count incomplete cases
sum(apply(df_notime, 1, function(x){any(is.na(x))}))
## [1] 53
# Remove those incomplete cases in small number
df_notime <- na.omit(df_notime)
# Ok, we have memory problems, lest just pick a small subset of rows
split_val <- 0.05
# Extract samples for hold out method
set.seed(1234) # for reproduction
idx.tr <-sample(1:nrow(df_notime),as.integer(split_val*nrow(df_notime)))
df_notime_sample <- df_notime[idx.tr,]
# Convert chars to factors in this reduced space
df_notime_sample <- mutate_if(df_notime_sample, is.character, as.factor)
# Wuick pick
(nrow(df_notime_sample))
## [1] 7000
# Compute Gower distance
gower_dist <- daisy(df_notime_sample, metric = "gower")
gower_mat <- as.matrix(gower_dist)
#' Print most similar clients
(df_notime_sample[which(gower_mat == min(gower_mat[gower_mat != min(gower_mat)]), arr.ind = TRUE)[1, ], ])
## Longitude Latitude Police_Force Accident_Severity Number_of_Vehicles
## 1: 0.285162 51.31869 Kent Slight 3
## 2: 0.504544 51.34127 Kent Slight 3
## Day_of_Week Local_Authority_District First_Road_Class First_Road_Number
## 1: Saturday Tonbridge and Malling Motorway 20
## 2: Saturday Tonbridge and Malling Motorway 2
## Road_Type Speed_limit Junction_Detail
## 1: Dual carriageway 70 Not at junction or within 20 metres
## 2: Dual carriageway 70 Not at junction or within 20 metres
## Junction_Control Second_Road_Number
## 1: Data missing or out of range 0
## 2: Data missing or out of range 0
## Pedestrian_Crossing-Human_Control
## 1: None within 50 metres
## 2: None within 50 metres
## Pedestrian_Crossing-Physical_Facilities Light_Conditions
## 1: No physical crossing facilities within 50 metres Daylight
## 2: No physical crossing facilities within 50 metres Daylight
## Weather_Conditions Road_Surface_Conditions Special_Conditions_at_Site
## 1: Fine no high winds Dry None
## 2: Fine no high winds Dry None
## Carriageway_Hazards Urban_or_Rural_Area
## 1: None Rural
## 2: None Rural
## Did_Police_Officer_Attend_Scene_of_Accident Season Month Hour Day_Period
## 1: Yes Fall Oct 9 Morning
## 2: Yes Fall Oct 9 Morning
## Casualties_Class Number_Vehicles_Class
## 1: Small Medium
## 2: Small Medium
#' Print most dissimilar clients
(df_notime_sample[which(gower_mat == max(gower_mat[gower_mat != max(gower_mat)]), arr.ind = TRUE)[1, ], ])
## Longitude Latitude Police_Force Accident_Severity Number_of_Vehicles
## 1: -1.237951 53.4900 South Yorkshire Serious 1
## 2: -2.192345 52.9077 Staffordshire Slight 4
## Day_of_Week Local_Authority_District First_Road_Class First_Road_Number
## 1: Friday Doncaster A 6023
## 2: Tuesday Stafford Motorway 6
## Road_Type Speed_limit Junction_Detail
## 1: Single carriageway 40 Crossroads
## 2: Dual carriageway 70 Not at junction or within 20 metres
## Junction_Control Second_Road_Number
## 1: Give way or uncontrolled 0
## 2: Data missing or out of range 0
## Pedestrian_Crossing-Human_Control
## 1: None within 50 metres
## 2: Data missing or out of range
## Pedestrian_Crossing-Physical_Facilities Light_Conditions
## 1: No physical crossing facilities within 50 metres Darkness - lights lit
## 2: Data missing or out of range Daylight
## Weather_Conditions Road_Surface_Conditions Special_Conditions_at_Site
## 1: Raining no high winds Wet or damp None
## 2: Fine no high winds Dry Data missing or out of range
## Carriageway_Hazards Urban_or_Rural_Area
## 1: None Urban
## 2: Data missing or out of range Rural
## Did_Police_Officer_Attend_Scene_of_Accident Season Month Hour Day_Period
## 1: Yes Winter Mar 2 Night
## 2: No Summer Sep 16 Afternoon
## Casualties_Class Number_Vehicles_Class
## 1: Small Small
## 2: Small Medium
The PAM algorithm searches for the k representative objects (the medoids) among the cases in the given data set.
As with k-means each observation is allocated to the nearest medoid.
PAM is more robust to the presence of outliers because it uses original objects as centroids instead of averages that may be subject to the effects of outliers
sil_width <- c(NA)
for(i in 2:8){
pam_fit <- pam(gower_dist, diss = TRUE, k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
}
plot(1:8, sil_width,
xlab = "Number of clusters",
ylab = "Silhouette Width")
lines(1:8, sil_width)
k <- 5
pam_fit <- pam(gower_dist, diss = TRUE, k)
pam_results <- df_notime_sample %>%
mutate(cluster = pam_fit$clustering) %>%
group_by(cluster) %>%
do(the_summary = summary(.))
pam_results$the_summary
## [[1]]
## Longitude Latitude Police_Force Accident_Severity
## Min. :-6.0995 Min. :50.07 Thames Valley : 69 Fatal : 34
## 1st Qu.:-2.5907 1st Qu.:51.45 Kent : 65 Serious: 250
## Median :-1.2756 Median :52.34 Surrey : 65 Slight :1154
## Mean :-1.4956 Mean :52.64 Devon and Cornwall: 58
## 3rd Qu.:-0.3068 3rd Qu.:53.41 Sussex : 55
## Max. : 1.7025 Max. :60.31 Hampshire : 50
## (Other) :1076
## Number_of_Vehicles Day_of_Week Local_Authority_District
## Min. :1.000 Friday :198 Cornwall : 21
## 1st Qu.:1.000 Monday :199 Cheshire East : 19
## Median :2.000 Saturday :178 Northumberland: 18
## Mean :1.882 Sunday :174 Doncaster : 16
## 3rd Qu.:2.000 Thursday :201 Peterborough : 16
## Max. :7.000 Tuesday :158 Highland : 14
## Wednesday:330 (Other) :1334
## First_Road_Class First_Road_Number Road_Type Speed_limit
## A :843 Min. : 0.0 Dual carriageway :384 60 :689
## A(M) : 12 1st Qu.: 20.0 One way street : 8 70 :302
## B :176 Median : 143.0 Roundabout : 53 30 :167
## C :102 Mean : 815.5 Single carriageway:959 40 :167
## Motorway :143 3rd Qu.: 648.2 Slip road : 27 50 :107
## Unclassified:162 Max. :9157.0 Unknown : 7 20 : 6
## (Other): 0
## Junction_Detail
## Not at junction or within 20 metres:1171
## Roundabout : 79
## Crossroads : 60
## T or staggered junction : 60
## Private drive or entrance : 29
## Other junction : 20
## (Other) : 19
## Junction_Control Second_Road_Number
## Authorised person : 0 Min. : -1
## Auto traffic signal : 48 1st Qu.: 0
## Data missing or out of range:1173 Median : 0
## Give way or uncontrolled : 209 Mean : 171
## Stop sign : 8 3rd Qu.: 0
## Max. :9310
##
## Pedestrian_Crossing-Human_Control
## Control by other authorised person: 2
## Control by school crossing patrol : 0
## Data missing or out of range : 2
## None within 50 metres :1434
##
##
##
## Pedestrian_Crossing-Physical_Facilities
## Central refuge : 10
## Data missing or out of range : 2
## Footbridge or subway : 5
## No physical crossing facilities within 50 metres :1371
## Pedestrian phase at traffic signal junction : 22
## Pelican, puffin, toucan or similar non-junction pedestrian light crossing: 19
## Zebra : 9
## Light_Conditions Weather_Conditions
## Darkness - lighting unknown: 18 Fine no high winds :1137
## Darkness - lights lit : 100 Raining no high winds: 179
## Darkness - lights unlit : 8 Fine + high winds : 32
## Darkness - no lighting : 223 Raining + high winds : 31
## Daylight :1089 Other : 23
## Unknown : 16
## (Other) : 20
## Road_Surface_Conditions Special_Conditions_at_Site
## Data missing or out of range: 5 None :1364
## Dry :968 Roadworks : 35
## Flood over 3cm. deep : 9 Mud : 13
## Frost or ice : 33 Road surface defective : 13
## Snow : 11 Auto traffic signal - out: 5
## Wet or damp :412 Oil or diesel : 5
## (Other) : 3
## Carriageway_Hazards Urban_or_Rural_Area
## Any animal in carriageway (except ridden horse): 17 Rural:1308
## Data missing or out of range : 1 Urban: 130
## None :1391
## Other object on road : 16
## Pedestrian in carriageway - not injured : 4
## Previous accident : 7
## Vehicle load on road : 2
## Did_Police_Officer_Attend_Scene_of_Accident
## No : 173
## No - accident was reported using a self completion form (self rep only): 3
## Yes :1262
##
##
##
##
## Season Month Hour Day_Period
## Fall :262 Apr :206 Min. : 0.00 Night :123
## Spring:488 Jun :152 1st Qu.:10.00 Morning :378
## Summer:368 Mar :134 Median :15.00 Afternoon:755
## Winter:320 Aug :132 Mean :13.67 Evening :182
## Sep :130 3rd Qu.:17.00
## May :125 Max. :23.00
## (Other):559
## Casualties_Class Number_Vehicles_Class cluster
## Small :1270 Small :1210 Min. :1
## Medium : 137 Medium: 228 1st Qu.:1
## Large : 31 Large : 0 Median :1
## Overweelming: 0 Mean :1
## 3rd Qu.:1
## Max. :1
##
##
## [[2]]
## Longitude Latitude Police_Force Accident_Severity
## Min. :-5.28016 Min. :50.23 Metropolitan Police:358 Fatal : 12
## 1st Qu.:-1.81518 1st Qu.:51.45 Thames Valley : 42 Serious:122
## Median :-0.42135 Median :51.56 Kent : 39 Slight :884
## Mean :-0.98840 Mean :52.18 Sussex : 37
## 3rd Qu.:-0.09676 3rd Qu.:52.72 Hampshire : 33
## Max. : 1.74888 Max. :57.44 West Midlands : 29
## (Other) :480
## Number_of_Vehicles Day_of_Week Local_Authority_District
## Min. :1.000 Friday :144 Westminster : 25
## 1st Qu.:1.000 Monday : 94 Croydon : 23
## Median :2.000 Saturday :140 Tower Hamlets: 22
## Mean :1.788 Sunday :125 Hackney : 20
## 3rd Qu.:2.000 Thursday :137 Lambeth : 20
## Max. :8.000 Tuesday :232 Camden : 18
## Wednesday:146 (Other) :890
## First_Road_Class First_Road_Number Road_Type Speed_limit
## A :628 Min. : 0.00 Dual carriageway :139 30 :814
## A(M) : 3 1st Qu.: 10.25 One way street : 23 40 : 88
## B :112 Median : 231.50 Roundabout :112 70 : 36
## C : 88 Mean :1043.48 Single carriageway:731 50 : 35
## Motorway : 16 3rd Qu.: 976.75 Slip road : 10 60 : 27
## Unclassified:171 Max. :9113.00 Unknown : 3 20 : 18
## (Other): 0
## Junction_Detail
## T or staggered junction :445
## Crossroads :154
## Roundabout :149
## Not at junction or within 20 metres:142
## Private drive or entrance : 37
## Mini-roundabout : 27
## (Other) : 64
## Junction_Control Second_Road_Number
## Authorised person : 0 Min. : -1.0
## Auto traffic signal :180 1st Qu.: 0.0
## Data missing or out of range:142 Median : 0.0
## Give way or uncontrolled :689 Mean : 584.7
## Stop sign : 7 3rd Qu.: 111.8
## Max. :9501.0
##
## Pedestrian_Crossing-Human_Control
## Control by other authorised person: 5
## Control by school crossing patrol : 0
## Data missing or out of range : 0
## None within 50 metres :1013
##
##
##
## Pedestrian_Crossing-Physical_Facilities
## Central refuge : 33
## Data missing or out of range : 0
## Footbridge or subway : 7
## No physical crossing facilities within 50 metres :713
## Pedestrian phase at traffic signal junction :140
## Pelican, puffin, toucan or similar non-junction pedestrian light crossing: 75
## Zebra : 50
## Light_Conditions Weather_Conditions
## Darkness - lighting unknown: 35 Fine no high winds :776
## Darkness - lights lit :820 Raining no high winds:143
## Darkness - lights unlit : 4 Unknown : 27
## Darkness - no lighting : 37 Raining + high winds : 25
## Daylight :122 Fine + high winds : 20
## Other : 18
## (Other) : 9
## Road_Surface_Conditions
## Data missing or out of range: 1
## Dry :647
## Flood over 3cm. deep : 0
## Frost or ice : 16
## Snow : 6
## Wet or damp :348
##
## Special_Conditions_at_Site
## None :1003
## Roadworks : 7
## Auto traffic signal - out : 5
## Mud : 1
## Road sign or marking defective or obscured: 1
## Road surface defective : 1
## (Other) : 0
## Carriageway_Hazards Urban_or_Rural_Area
## Any animal in carriageway (except ridden horse): 6 Rural:158
## Data missing or out of range : 0 Urban:860
## None :1006
## Other object on road : 2
## Pedestrian in carriageway - not injured : 3
## Previous accident : 0
## Vehicle load on road : 1
## Did_Police_Officer_Attend_Scene_of_Accident
## No :179
## No - accident was reported using a self completion form (self rep only): 3
## Yes :836
##
##
##
##
## Season Month Hour Day_Period
## Fall :288 Jan :226 Min. : 0.00 Night :114
## Spring:135 Feb :140 1st Qu.:17.00 Morning : 27
## Summer:113 Dec :125 Median :19.00 Afternoon:285
## Winter:482 Nov :112 Mean :17.27 Evening :592
## Mar :104 3rd Qu.:21.00
## Oct : 68 Max. :23.00
## (Other):243
## Casualties_Class Number_Vehicles_Class cluster
## Small :972 Small :957 Min. :2
## Medium : 38 Medium: 61 1st Qu.:2
## Large : 8 Large : 0 Median :2
## Overweelming: 0 Mean :2
## 3rd Qu.:2
## Max. :2
##
##
## [[3]]
## Longitude Latitude Police_Force Accident_Severity
## Min. :-5.43716 Min. :50.11 Metropolitan Police:643 Fatal : 13
## 1st Qu.:-1.64347 1st Qu.:51.45 Thames Valley : 75 Serious: 234
## Median :-0.40139 Median :51.57 Sussex : 66 Slight :1637
## Mean :-0.92433 Mean :52.17 Hampshire : 60
## 3rd Qu.:-0.09699 3rd Qu.:52.90 Kent : 58
## Max. : 1.74594 Max. :57.64 Surrey : 54
## (Other) :928
## Number_of_Vehicles Day_of_Week Local_Authority_District
## Min. :1.000 Friday :465 Westminster: 39
## 1st Qu.:2.000 Monday :196 Lambeth : 33
## Median :2.000 Saturday :223 Brent : 32
## Mean :1.902 Sunday :184 Wandsworth : 31
## 3rd Qu.:2.000 Thursday :313 Croydon : 29
## Max. :6.000 Tuesday :250 Islington : 28
## Wednesday:253 (Other) :1692
## First_Road_Class First_Road_Number Road_Type
## A :1144 Min. : 0 Dual carriageway : 233
## A(M) : 5 1st Qu.: 11 One way street : 34
## B : 220 Median : 238 Roundabout : 213
## C : 186 Mean :1087 Single carriageway:1379
## Motorway : 23 3rd Qu.:1041 Slip road : 20
## Unclassified: 306 Max. :9125 Unknown : 5
##
## Speed_limit Junction_Detail
## 30 :1398 T or staggered junction :921
## 40 : 166 Roundabout :259
## 60 : 128 Crossroads :249
## 50 : 77 Not at junction or within 20 metres:194
## 70 : 63 Private drive or entrance :101
## 20 : 52 Other junction : 54
## (Other): 0 (Other) :106
## Junction_Control Second_Road_Number
## Authorised person : 1 Min. : -1.0
## Auto traffic signal : 296 1st Qu.: 0.0
## Data missing or out of range: 195 Median : 0.0
## Give way or uncontrolled :1379 Mean : 531.0
## Stop sign : 13 3rd Qu.: 112.2
## Max. :9556.0
##
## Pedestrian_Crossing-Human_Control
## Control by other authorised person: 7
## Control by school crossing patrol : 8
## Data missing or out of range : 0
## None within 50 metres :1869
##
##
##
## Pedestrian_Crossing-Physical_Facilities
## Central refuge : 49
## Data missing or out of range : 0
## Footbridge or subway : 7
## No physical crossing facilities within 50 metres :1379
## Pedestrian phase at traffic signal junction : 226
## Pelican, puffin, toucan or similar non-junction pedestrian light crossing: 136
## Zebra : 87
## Light_Conditions Weather_Conditions
## Darkness - lighting unknown: 10 Fine no high winds :1632
## Darkness - lights lit : 63 Raining no high winds: 154
## Darkness - lights unlit : 3 Unknown : 33
## Darkness - no lighting : 11 Other : 21
## Daylight :1797 Fine + high winds : 16
## Raining + high winds : 13
## (Other) : 15
## Road_Surface_Conditions
## Data missing or out of range: 4
## Dry :1505
## Flood over 3cm. deep : 1
## Frost or ice : 13
## Snow : 5
## Wet or damp : 356
##
## Special_Conditions_at_Site
## None :1852
## Roadworks : 18
## Oil or diesel : 6
## Auto traffic signal - out : 3
## Road surface defective : 2
## Auto signal part defective: 1
## (Other) : 2
## Carriageway_Hazards Urban_or_Rural_Area
## Any animal in carriageway (except ridden horse): 3 Rural: 438
## Data missing or out of range : 1 Urban:1446
## None :1867
## Other object on road : 7
## Pedestrian in carriageway - not injured : 3
## Previous accident : 3
## Vehicle load on road : 0
## Did_Police_Officer_Attend_Scene_of_Accident
## No : 375
## No - accident was reported using a self completion form (self rep only): 2
## Yes :1507
##
##
##
##
## Season Month Hour Day_Period
## Fall :387 Jul :410 Min. : 0.00 Night : 110
## Spring:254 Sep :246 1st Qu.: 8.00 Morning :1263
## Summer:909 Aug :226 Median :11.00 Afternoon: 423
## Winter:334 Jun :152 Mean :10.97 Evening : 88
## Mar :139 3rd Qu.:13.00
## Nov :135 Max. :23.00
## (Other):576
## Casualties_Class Number_Vehicles_Class cluster
## Small :1800 Small :1714 Min. :3
## Medium : 71 Medium: 170 1st Qu.:3
## Large : 12 Large : 0 Median :3
## Overweelming: 1 Mean :3
## 3rd Qu.:3
## Max. :3
##
##
## [[4]]
## Longitude Latitude Police_Force Accident_Severity
## Min. :-5.4311 Min. :50.17 West Midlands :197 Fatal : 10
## 1st Qu.:-2.3917 1st Qu.:51.51 Metropolitan Police: 81 Serious: 169
## Median :-1.7967 Median :52.47 Hampshire : 54 Slight :1058
## Mean :-1.6770 Mean :52.58 Devon and Cornwall : 43
## 3rd Qu.:-0.8813 3rd Qu.:53.42 Greater Manchester : 42
## Max. : 1.7537 Max. :57.48 Lancashire : 42
## (Other) :778
## Number_of_Vehicles Day_of_Week Local_Authority_District
## Min. :1.000 Friday :116 Birmingham : 111
## 1st Qu.:2.000 Monday :356 Coventry : 18
## Median :2.000 Saturday :160 Sandwell : 16
## Mean :1.863 Sunday :131 Bristol, City of: 15
## 3rd Qu.:2.000 Thursday :188 Liverpool : 15
## Max. :5.000 Tuesday :123 Nottingham : 14
## Wednesday:163 (Other) :1048
## First_Road_Class First_Road_Number Road_Type
## A :264 Min. : 0 Dual carriageway : 82
## A(M) : 4 1st Qu.: 0 One way street : 30
## B :189 Median : 5 Roundabout : 82
## C : 95 Mean :1069 Single carriageway:1029
## Motorway : 9 3rd Qu.: 813 Slip road : 5
## Unclassified:676 Max. :9304 Unknown : 9
##
## Speed_limit Junction_Detail
## 30 :1022 T or staggered junction :702
## 40 : 73 Crossroads :163
## 60 : 54 Roundabout :113
## 20 : 53 Not at junction or within 20 metres: 89
## 50 : 20 Private drive or entrance : 67
## 70 : 15 Other junction : 47
## (Other): 0 (Other) : 56
## Junction_Control Second_Road_Number
## Authorised person : 0 Min. : -1.0
## Auto traffic signal : 118 1st Qu.: 0.0
## Data missing or out of range: 89 Median : 0.0
## Give way or uncontrolled :1026 Mean : 516.5
## Stop sign : 4 3rd Qu.: 0.0
## Max. :9219.0
##
## Pedestrian_Crossing-Human_Control
## Control by other authorised person: 6
## Control by school crossing patrol : 3
## Data missing or out of range : 0
## None within 50 metres :1228
##
##
##
## Pedestrian_Crossing-Physical_Facilities
## Central refuge : 38
## Data missing or out of range : 0
## Footbridge or subway : 2
## No physical crossing facilities within 50 metres :1013
## Pedestrian phase at traffic signal junction : 83
## Pelican, puffin, toucan or similar non-junction pedestrian light crossing: 75
## Zebra : 26
## Light_Conditions Weather_Conditions
## Darkness - lighting unknown: 16 Fine no high winds :1034
## Darkness - lights lit : 136 Raining no high winds: 117
## Darkness - lights unlit : 4 Unknown : 26
## Darkness - no lighting : 18 Fine + high winds : 20
## Daylight :1063 Other : 18
## Raining + high winds : 16
## (Other) : 6
## Road_Surface_Conditions
## Data missing or out of range: 3
## Dry :975
## Flood over 3cm. deep : 0
## Frost or ice : 2
## Snow : 1
## Wet or damp :256
##
## Special_Conditions_at_Site
## None :1221
## Roadworks : 8
## Road sign or marking defective or obscured: 4
## Oil or diesel : 2
## Auto traffic signal - out : 1
## Road surface defective : 1
## (Other) : 0
## Carriageway_Hazards Urban_or_Rural_Area
## Any animal in carriageway (except ridden horse): 0 Rural: 209
## Data missing or out of range : 0 Urban:1028
## None :1229
## Other object on road : 5
## Pedestrian in carriageway - not injured : 2
## Previous accident : 1
## Vehicle load on road : 0
## Did_Police_Officer_Attend_Scene_of_Accident
## No :281
## No - accident was reported using a self completion form (self rep only): 6
## Yes :950
##
##
##
##
## Season Month Hour Day_Period
## Fall :243 May :287 Min. : 0.00 Night : 52
## Spring:622 Apr :156 1st Qu.:13.00 Morning :199
## Summer:172 Jun :143 Median :15.00 Afternoon:868
## Winter:200 Mar :122 Mean :14.62 Evening :118
## Nov : 91 3rd Qu.:17.00
## Sep : 82 Max. :23.00
## (Other):356
## Casualties_Class Number_Vehicles_Class cluster
## Small :1151 Small :1134 Min. :4
## Medium : 74 Medium: 103 1st Qu.:4
## Large : 12 Large : 0 Median :4
## Overweelming: 0 Mean :4
## 3rd Qu.:4
## Max. :4
##
##
## [[5]]
## Longitude Latitude Police_Force Accident_Severity
## Min. :-5.2256 Min. :50.23 West Yorkshire :175 Fatal : 18
## 1st Qu.:-2.6191 1st Qu.:51.57 Metropolitan Police: 96 Serious: 219
## Median :-1.5835 Median :53.05 Strathclyde : 68 Slight :1186
## Mean :-1.6915 Mean :52.97 Lancashire : 60
## 3rd Qu.:-0.7561 3rd Qu.:53.78 Merseyside : 57
## Max. : 1.7336 Max. :58.59 Thames Valley : 55
## (Other) :912
## Number_of_Vehicles Day_of_Week Local_Authority_District
## Min. : 1.00 Friday :208 Leeds : 75
## 1st Qu.: 1.00 Monday :171 Kirklees : 30
## Median : 2.00 Saturday :187 Liverpool : 28
## Mean : 1.75 Sunday :141 Bradford : 27
## 3rd Qu.: 2.00 Thursday :230 Wakefield : 27
## Max. :10.00 Tuesday :340 Glasgow City: 25
## Wednesday:146 (Other) :1211
## First_Road_Class First_Road_Number Road_Type
## A :349 Min. : 0.0 Dual carriageway : 191
## A(M) : 12 1st Qu.: 0.0 One way street : 47
## B :165 Median : 19.0 Roundabout : 28
## C : 93 Mean : 984.9 Single carriageway:1123
## Motorway : 73 3rd Qu.: 649.5 Slip road : 14
## Unclassified:731 Max. :9711.0 Unknown : 20
##
## Speed_limit Junction_Detail
## 30 :1072 Not at junction or within 20 metres:1184
## 70 : 92 Crossroads : 84
## 40 : 86 T or staggered junction : 56
## 20 : 77 Roundabout : 36
## 60 : 64 Private drive or entrance : 26
## 50 : 32 Other junction : 17
## (Other): 0 (Other) : 20
## Junction_Control Second_Road_Number
## Authorised person : 1 Min. : -1.0
## Auto traffic signal : 82 1st Qu.: 0.0
## Data missing or out of range:1184 Median : 0.0
## Give way or uncontrolled : 152 Mean : 85.6
## Stop sign : 4 3rd Qu.: 0.0
## Max. :7201.0
##
## Pedestrian_Crossing-Human_Control
## Control by other authorised person: 3
## Control by school crossing patrol : 5
## Data missing or out of range : 3
## None within 50 metres :1412
##
##
##
## Pedestrian_Crossing-Physical_Facilities
## Central refuge : 27
## Data missing or out of range : 3
## Footbridge or subway : 1
## No physical crossing facilities within 50 metres :1216
## Pedestrian phase at traffic signal junction : 70
## Pelican, puffin, toucan or similar non-junction pedestrian light crossing: 62
## Zebra : 44
## Light_Conditions Weather_Conditions
## Darkness - lighting unknown: 16 Fine no high winds :1157
## Darkness - lights lit : 258 Raining no high winds: 155
## Darkness - lights unlit : 8 Unknown : 29
## Darkness - no lighting : 57 Fine + high winds : 24
## Daylight :1084 Raining + high winds : 24
## Other : 18
## (Other) : 16
## Road_Surface_Conditions
## Data missing or out of range: 5
## Dry :1025
## Flood over 3cm. deep : 2
## Frost or ice : 6
## Snow : 5
## Wet or damp : 380
##
## Special_Conditions_at_Site
## None :1378
## Roadworks : 23
## Mud : 7
## Oil or diesel : 4
## Road surface defective : 4
## Data missing or out of range: 3
## (Other) : 4
## Carriageway_Hazards Urban_or_Rural_Area
## Any animal in carriageway (except ridden horse): 6 Rural: 305
## Data missing or out of range : 3 Urban:1118
## None :1387
## Other object on road : 19
## Pedestrian in carriageway - not injured : 5
## Previous accident : 1
## Vehicle load on road : 2
## Did_Police_Officer_Attend_Scene_of_Accident
## No : 367
## No - accident was reported using a self completion form (self rep only): 8
## Yes :1048
##
##
##
##
## Season Month Hour Day_Period
## Fall :727 Oct :320 Min. : 0.00 Night : 88
## Spring:179 Nov :193 1st Qu.:10.00 Morning :423
## Summer:282 Dec :163 Median :14.00 Afternoon:759
## Winter:235 Sep :140 Mean :13.53 Evening :153
## Jun :101 3rd Qu.:17.00
## Aug :100 Max. :23.00
## (Other):406
## Casualties_Class Number_Vehicles_Class cluster
## Small :1344 Small :1280 Min. :5
## Medium : 67 Medium: 142 1st Qu.:5
## Large : 12 Large : 1 Median :5
## Overweelming: 0 Mean :5
## 3rd Qu.:5
## Max. :5
##
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering))
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster))
# Create the array list to iterate
# Considering that we are subsetting, we can handle the subset all. condideting dates, lets make motn by mothn
# Lets set out array to iterate
month_array <- c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
# Iterate and do things
for ( month in month_array){
print(month)
# Some intial colum name analisis
(colnames(df_subset))
# Lets drop initialy the time to see things
df_notime_temp <- df_subset[, -c('Date', 'Time', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
'LSOA_of_Accident_Location','Second_Road_Class', 'Number_of_Casualties')]
# Access the number of levels
(apply(df_notime_temp, 2, function(x) length( unique(x) )))
# Access the incomplete cases
(apply(is.na(df_notime_temp), 2, sum))
# Count incomplete cases
sum(apply(df_notime_temp, 1, function(x){any(is.na(x))}))
# Remove those incomplete cases in small number
df_notime_temp <- na.omit(df_notime_temp)
# Convert chars to factors in this reduced space
df_notime_temp <- mutate_if(df_notime_temp, is.character, as.factor)
# Compute Gower distance
gower_dist <- daisy(df_notime_temp, metric = "gower")
gower_mat <- as.matrix(gower_dist)
# Do pam clustering Medoid
k <- 5
pam_fit <- pam(gower_dist, diss = TRUE, k)
pam_results <- df_notime_sample %>%
mutate(cluster = pam_fit$clustering) %>%
group_by(cluster) %>%
do(the_summary = summary(.))
pam_results$the_summary
# Do some representation
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering))
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster))
}